From 3453b964a3ed84d99c9ae33bc0fae00790df36ef Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sat, 15 Mar 2025 18:46:17 -0700 Subject: [PATCH 01/34] [Misc][Doc] Minor benchmark README update (#14874) Signed-off-by: Roger Wang --- benchmarks/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index c64c24fd3ad05..3225a4b0db3a0 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -82,10 +82,10 @@ Then run the benchmarking script # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" NUM_PROMPTS=10 -BACKEND="openai-chat" +BACKEND="vllm" DATASET_NAME="sharegpt" DATASET_PATH="/ShareGPT_V3_unfiltered_cleaned_split.json" -python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/chat/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS} +python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS} ``` If successful, you will see the following output From def232e122624504e49f1e5ff0ae01a7285de1a3 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 16 Mar 2025 09:53:52 +0800 Subject: [PATCH 02/34] [VLM] Clean up Phi-4-MM ViT implementation (#14812) Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Cyrus Leung --- requirements/test.in | 1 + requirements/test.txt | 2 + .../vision_language/test_phi4mm.py | 229 ++ vllm/model_executor/models/aria.py | 4 +- .../models/idefics2_vision_model.py | 57 +- vllm/model_executor/models/phi4mm.py | 45 +- .../models/vision_siglip_navit.py | 1966 ----------------- 7 files changed, 316 insertions(+), 1988 deletions(-) create mode 100644 tests/models/decoder_only/vision_language/test_phi4mm.py delete mode 100644 vllm/model_executor/models/vision_siglip_navit.py diff --git a/requirements/test.in b/requirements/test.in index cc89d518c7eec..c171e8d41ddc2 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -8,6 +8,7 @@ pytest-shard # testing utils awscli +backoff # required for phi4mm test decord # required for video tests einops # required for MPT, qwen-vl and Mamba httpx diff --git a/requirements/test.txt b/requirements/test.txt index c2cdd2c8664d8..10fb1f14c3a18 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -33,6 +33,8 @@ audioread==3.0.1 # via librosa awscli==1.35.23 # via -r requirements/test.in +backoff==2.2.1 + # via -r requirements/test.in bitsandbytes==0.45.3 # via -r requirements/test.in black==24.10.0 diff --git a/tests/models/decoder_only/vision_language/test_phi4mm.py b/tests/models/decoder_only/vision_language/test_phi4mm.py new file mode 100644 index 0000000000000..fb69beaf77598 --- /dev/null +++ b/tests/models/decoder_only/vision_language/test_phi4mm.py @@ -0,0 +1,229 @@ +# SPDX-License-Identifier: Apache-2.0 + +import os +import re +from typing import Optional + +import pytest +from huggingface_hub import snapshot_download +from transformers import AutoTokenizer + +from vllm.lora.request import LoRARequest +from vllm.multimodal.image import rescale_image_size +from vllm.platforms import current_platform +from vllm.sequence import SampleLogprobs + +from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner +from 
....utils import large_gpu_test +from ...utils import check_logprobs_close + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501 + "cherry_blossom": + "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501 +}) +HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501 + +model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct") +# Since the vision-lora and speech-lora co-exist with the base model, +# we have to manually specify the path of the lora weights. +vision_lora_path = os.path.join(model_path, "vision-lora") +models = [model_path] + + +def vllm_to_hf_output(vllm_output: tuple[list[int], str, + Optional[SampleLogprobs]], + model: str): + """Sanitize vllm output to be comparable with hf output.""" + _, output_str, out_logprobs = vllm_output + + output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str) + assert output_str_without_image[0] == " " + output_str_without_image = output_str_without_image[1:] + + hf_output_str = output_str_without_image + "<|end|><|endoftext|>" + + tokenizer = AutoTokenizer.from_pretrained(model) + hf_output_ids = tokenizer.encode(output_str_without_image) + assert hf_output_ids[0] == 1 + hf_output_ids = hf_output_ids[1:] + + return hf_output_ids, hf_output_str, out_logprobs + + +target_dtype = "half" + +# ROCm Triton FA can run into shared memory issues with these models, +# use other backends in the meantime +# FIXME (mattwong, gshtrasb, hongxiayan) +if current_platform.is_rocm(): + os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0" + + +def run_test( + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + inputs: list[tuple[list[str], PromptImageInput]], + model: str, + *, + max_model_len: int, + dtype: str, + max_tokens: int, + num_logprobs: int, + mm_limit: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test are from IMAGE_ASSETS. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalDataDict objects + and corresponding MultiModalConfig as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. + """ + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). 
+ # max_model_len should be greater than image_feature_size + with vllm_runner( + model, + task="generate", + max_model_len=max_model_len, + max_num_seqs=2, + dtype=dtype, + limit_mm_per_prompt={"image": mm_limit}, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enable_lora=True, + max_lora_rank=320, + lora_extra_vocab_size=0, + gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI + enforce_eager=True, + ) as vllm_model: + lora_request = LoRARequest("vision", 1, vision_lora_path) + vllm_model.model.llm_engine.add_lora(lora_request=lora_request) + vllm_outputs_per_case = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs + ] + + # use eager mode for hf runner, since phi3_v didn't work with flash_attn + hf_model_kwargs = {"_attn_implementation": "eager"} + with hf_runner(model, dtype=dtype, + model_kwargs=hf_model_kwargs) as hf_model: + eos_token_id = hf_model.processor.tokenizer.eos_token_id + hf_outputs_per_case = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images, + eos_token_id=eos_token_id, + num_logits_to_keep=0) + for prompts, images in inputs + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, + vllm_outputs_per_case): + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +# Since we use _attn_implementation="eager" for hf_runner, there is more +# significant numerical difference. The basic `logprobs=5` fails to pass. +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.7, 0.75, 1.0], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_model_len", [4096]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + dtype: str, max_model_len: int, max_tokens: int, + num_logprobs: int) -> None: + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + run_test( + hf_runner, + vllm_runner, + inputs_per_image, + model, + dtype=dtype, + max_model_len=max_model_len, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) + + +@large_gpu_test(min_gb=48) +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + # [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_model_len", [10000]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +@pytest.mark.xfail( + reason="Phi-4-MM multi-image inference is divergent with hf model.") +def test_multi_images_models(hf_runner, vllm_runner, image_assets, model, + size_factors, dtype: str, max_model_len: int, + max_tokens: int, num_logprobs: int) -> None: + images = [asset.pil_image for asset in image_assets] + + inputs_per_case = [ + ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors], + 
[[rescale_image_size(image, factor) for image in images] + for factor in size_factors]) + ] + + run_test( + hf_runner, + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_model_len=max_model_len, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=2, + tensor_parallel_size=1, + ) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index ecd0a04b1dff7..8cd3be90ca8da 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -60,7 +60,7 @@ class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant): quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: - super().__init__(config, quant_config, prefix) + super().__init__(config, quant_config=quant_config, prefix=prefix) # Unlike Idefics3VisionTransformer which uses LayerNorm after the # final layer, Aria omits this normalization, so we replace it with an # Identity layer @@ -512,7 +512,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): self.config = config self.vision_tower = AriaVisionTransformer( config.vision_config, - quant_config, + quant_config=quant_config, prefix=f"{prefix}.vision_tower", ) self.multi_modal_projector = AriaProjector(config) diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index f9c2175b29881..cb0379c10f3a6 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -113,7 +113,7 @@ class Idefics2VisionAttention(nn.Module): def __init__( self, - config: Idefics2Config, + config: Idefics2VisionConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: @@ -164,7 +164,7 @@ class Idefics2VisionMLP(nn.Module): def __init__( self, - config: Idefics2Config, + config: Idefics2VisionConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: @@ -249,16 +249,24 @@ class Idefics2Encoder(nn.Module): self, config: Idefics2Config, quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, prefix: str = "", ) -> None: super().__init__() self.config = config + + if num_hidden_layers_override is None: + num_hidden_layers = config.num_hidden_layers + else: + num_hidden_layers = num_hidden_layers_override + self.layers = nn.ModuleList([ Idefics2EncoderLayer(config, quant_config=quant_config, prefix=f"{prefix}.layers.{layer_idx}") - for layer_idx in range(config.num_hidden_layers) + for layer_idx in range(num_hidden_layers) ]) def forward( @@ -287,6 +295,9 @@ class Idefics2VisionTransformer(nn.Module): self, config: Idefics2VisionConfig, quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + require_post_norm: bool = True, prefix: str = "", ) -> None: super().__init__() @@ -294,11 +305,24 @@ class Idefics2VisionTransformer(nn.Module): embed_dim = config.hidden_size self.config = config self.embeddings = Idefics2VisionEmbeddings(config) - self.encoder = Idefics2Encoder(config, - quant_config=quant_config, - prefix=f"{prefix}.encoder") - self.post_layernorm = nn.LayerNorm(embed_dim, - eps=config.layer_norm_eps) + self.encoder = Idefics2Encoder( + config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers_override, + prefix=f"{prefix}.encoder") + + num_hidden_layers = config.num_hidden_layers + if len(self.encoder.layers) > config.num_hidden_layers: + raise ValueError( + f"The 
original encoder only has {num_hidden_layers} " + f"layers, but you requested {len(self.encoder.layers)} layers." + ) + + self.require_post_norm = require_post_norm + self.post_layernorm = nn.LayerNorm( + embed_dim, + eps=config.layer_norm_eps, + ) if require_post_norm else nn.Identity() def get_input_embeddings(self): return self.embeddings @@ -328,7 +352,24 @@ class Idefics2VisionTransformer(nn.Module): ] params_dict = dict(self.named_parameters()) loaded_params: Set[str] = set() + layer_count = len(self.encoder.layers) + for name, loaded_weight in weights: + # skip pooling header + if name.startswith("head."): + continue + + # post_layernorm is optional + if (name.startswith("post_layernorm.") + and not self.require_post_norm): + continue + + # omit layers when num_hidden_layers_override is set + if name.startswith("encoder.layers."): + layer_idx = int(name.split(".")[2]) + if layer_idx >= layer_count: + continue + for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 2a839f3a50317..7250aaba557eb 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -11,7 +11,7 @@ import torch import torch.nn as nn import torchvision.transforms as T from PIL import Image -from transformers import PretrainedConfig +from transformers import PretrainedConfig, SiglipVisionConfig from transformers.utils import logging from vllm.config import VllmConfig @@ -32,10 +32,10 @@ from vllm.multimodal.inputs import MultiModalInputs, NestedTensors from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config +from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import SupportsLoRA, SupportsMultiModal from .phi4mm_audio import AudioEmbedding from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix -from .vision_siglip_navit import get_siglip_vision_model # <|endoftext10|> (see vocab.json in hf model) _IMAGE_PLACEHOLDER_TOKEN_ID = 200010 @@ -339,6 +339,33 @@ def preprocess(images, dynamic_hd_size, vit_resolution, vit_patch_size): return data +def get_navit_vision_model(layer_idx: int = -1, **kwargs): + vision_config = { + "hidden_size": 1152, + "image_size": 448, + "intermediate_size": 4304, + "model_type": "siglip_vision_model", + "num_attention_heads": 16, + "num_hidden_layers": 27, + "patch_size": 14, + } + + model_config = SiglipVisionConfig(**vision_config, **kwargs) + if layer_idx < 0: + num_hidden_layers = model_config.num_hidden_layers \ + + layer_idx + 1 + else: + num_hidden_layers = layer_idx + 1 + + vision_model = Idefics2VisionTransformer( + config=model_config, + require_post_norm=False, + num_hidden_layers_override=num_hidden_layers, + ) + + return vision_model + + class Phi4MMImageEncoder(nn.Module): """Image embedding.""" @@ -362,8 +389,7 @@ class Phi4MMImageEncoder(nn.Module): self.layer_idx = -2 self.type_feature = 'patch' - self.img_processor = get_siglip_vision_model( - _flash_attn_2_enabled=True) + self.img_processor = get_navit_vision_model(layer_idx=self.layer_idx) pe_weight = self.img_processor.embeddings.position_embedding.weight L, D = pe_weight.size() @@ -430,16 +456,11 @@ class Phi4MMImageEncoder(nn.Module): def get_img_features(self, img_embeds: torch.FloatTensor, attention_mask=None) -> torch.FloatTensor: - LAYER_IDX = self.layer_idx - TYPE_FEATURE = self.type_feature - img_processor_output = 
self.img_processor( - img_embeds, - output_hidden_states=True, - patch_attention_mask=attention_mask) - img_feature = img_processor_output.hidden_states[LAYER_IDX] + img_feature = self.img_processor(img_embeds, + patch_attention_mask=attention_mask) - if TYPE_FEATURE == "patch": + if self.type_feature == "patch": patch_feature = img_feature use_token_compression = self.image_token_compression is not None diff --git a/vllm/model_executor/models/vision_siglip_navit.py b/vllm/model_executor/models/vision_siglip_navit.py deleted file mode 100644 index 3a9597a845ff9..0000000000000 --- a/vllm/model_executor/models/vision_siglip_navit.py +++ /dev/null @@ -1,1966 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Siglip model configuration""" - -import math -import os -import warnings -from dataclasses import dataclass -from typing import Any, Optional, Tuple, Union - -import numpy as np -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn.init import _calculate_fan_in_and_fan_out -from transformers.activations import ACT2FN -from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask -from transformers.modeling_outputs import (BaseModelOutput, - BaseModelOutputWithPooling) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import (ModelOutput, add_start_docstrings, - add_start_docstrings_to_model_forward, logging, - replace_return_docstrings) - -from vllm.platforms import _Backend - -from .vision import get_vit_attn_backend - -logger = logging.get_logger(__name__) - -SIGLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "google/siglip-base-patch16-224": - "https://huggingface.co/google/siglip-base-patch16-224/"\ - "resolve/main/config.json", -} - - -class SiglipTextConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a - [`SiglipTextModel`]. It is used to instantiate a Siglip text encoder - according to the specified arguments, defining the model architecture. - Instantiating a configuration with the defaults will yield a similar - configuration to that of the text encoder of the Siglip [google/ - siglip-base-patch16-224](https://huggingface.co/google/siglip-base - -patch16-224) architecture. - Configuration objects inherit from [`PretrainedConfig`] and can be used - to control the model outputs. Read the documentation from - [`PretrainedConfig`] for more information. - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the Siglip text model. Defines the number of - different tokens that can be represented by the `inputs_ids` - passed when calling [`SiglipModel`]. - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. 
- intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer - in the Transformer encoder. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the - Transformer encoder. - max_position_embeddings (`int`, *optional*, defaults to 64): - The maximum sequence length that this model might ever be used - with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - hidden_act (`str` or `function`, *optional*, defaults to - `"gelu_pytorch_tanh"`): - The non-linear activation function (function or string) in the - encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the layer normalization layers. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - pad_token_id (`int`, *optional*, defaults to 1): - The id of the padding token in the vocabulary. - bos_token_id (`int`, *optional*, defaults to 49406): - The id of the beginning-of-sequence token in the vocabulary. - eos_token_id (`int`, *optional*, defaults to 49407): - The id of the end-of-sequence token in the vocabulary. - Example: - ```python - >>> from transformers import SiglipTextConfig, SiglipTextModel - >>> # Initializing a SiglipTextConfig with google/siglip-base-patch16-224 - style configuration - >>> configuration = SiglipTextConfig() - >>> # Initializing a SiglipTextModel (with random weights) from the - google/siglip-base-patch16-224 style configuration - >>> model = SiglipTextModel(configuration) - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "siglip_text_model" - - def __init__( - self, - vocab_size=32000, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - max_position_embeddings=64, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - # This differs from `CLIPTokenizer`'s default and from openai/siglip - # See https://github.com/huggingface/transformers/pull/24773# - # issuecomment-1632287538 - pad_token_id=1, - bos_token_id=49406, - eos_token_id=49407, - _flash_attn_2_enabled=True, - **kwargs, - ): - super().__init__(pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.attention_dropout = attention_dropout - self._flash_attn_2_enabled = _flash_attn_2_enabled - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, - os.PathLike], - **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict( - pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from SiglipConfig - if config_dict.get("model_type") == "siglip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr( - cls, - "model_type") and config_dict["model_type"] 
!= cls.model_type: - logger.warning( - "You are using a model of type %s to instantiate a model of " - "type %s. This is not supported for all configurations of " - "models and can yield errors.", config_dict['model_type'], - cls.model_type) - - return cls.from_dict(config_dict, **kwargs) - - -class SiglipVisionConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a - [`SiglipVisionModel`]. It is used to instantiate a - Siglip vision encoder according to the specified arguments, defining the - model architecture. Instantiating a configuration with the defaults will - yield a similar configuration to that of the vision encoder of the Siglip - [google/siglip-base-patch16-224](https://huggingface.co/google/ - siglip-base-patch16-224) architecture. - Configuration objects inherit from [`PretrainedConfig`] and can be used - to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - Args: - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer - in the Transformer encoder. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the - Transformer encoder. - num_channels (`int`, *optional*, defaults to 3): - Number of channels in the input images. - image_size (`int`, *optional*, defaults to 224): - The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to 16): - The size (resolution) of each patch. - hidden_act (`str` or `function`, *optional*, defaults to - `"gelu_pytorch_tanh"`): - The non-linear activation function (function or string) in the - encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and - `"gelu_new"` ``"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the layer normalization layers. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. 
- Example: - ```python - >>> from transformers import SiglipVisionConfig, SiglipVisionModel - >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 - style configuration - >>> configuration = SiglipVisionConfig() - >>> # Initializing a SiglipVisionModel (with random weights) from the - google/siglip-base-patch16-224 style configuration - >>> model = SiglipVisionModel(configuration) - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "siglip_vision_model" - - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=224, - patch_size=16, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - _flash_attn_2_enabled=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self._flash_attn_2_enabled = _flash_attn_2_enabled - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, - os.PathLike], - **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict( - pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from SiglipConfig - if config_dict.get("model_type") == "siglip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr( - cls, - "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - "You are using a model of type %s to " - "instantiate a model of type %s. This is not" - " supported for all configurations of models and can yield" - " errors.", config_dict['model_type'], cls.model_type) - - return cls.from_dict(config_dict, **kwargs) - - -class SiglipConfig(PretrainedConfig): - r""" - [`SiglipConfig`] is the configuration class to store the configuration of a - [`SiglipModel`]. It is used to instantiate a Siglip model according to the - specified arguments, defining the text model and vision model configs. - Instantiating a configuration with the defaults will yield a similar - configuration to that of the Siglip [google/siglip-base-patch16-224]( - https://huggingface.co/google/siglip-base-patch16-224) architecture. - Configuration objects inherit from [`PretrainedConfig`] and can be used to - control the model outputs. Read the documentation from - [`PretrainedConfig`] for more information. - Args: - text_config (`dict`, *optional*): - Dictionary of configuration options used to initialize - [`SiglipTextConfig`]. - vision_config (`dict`, *optional*): - Dictionary of configuration options used to initialize - [`SiglipVisionConfig`]. - kwargs (*optional*): - Dictionary of keyword arguments. 
- Example: - ```python - >>> from transformers import SiglipConfig, SiglipModel - >>> # Initializing a SiglipConfig with google/siglip-base-patch16-224 - style configuration - >>> configuration = SiglipConfig() - >>> # Initializing a SiglipModel (with random weights) from the - google/siglip-base-patch16-224 style configuration - >>> model = SiglipModel(configuration) - >>> # Accessing the model configuration - >>> configuration = model.config - >>> # We can also initialize a SiglipConfig from a SiglipTextConfig - and a SiglipVisionConfig - >>> from transformers import SiglipTextConfig, SiglipVisionConfig - >>> # Initializing a SiglipText and SiglipVision configuration - >>> config_text = SiglipTextConfig() - >>> config_vision = SiglipVisionConfig() - >>> config = SiglipConfig.from_text_vision_configs(config_text, - config_vision) - ```""" - - model_type = "siglip" - - def __init__(self, text_config=None, vision_config=None, **kwargs): - super().__init__(**kwargs) - - if text_config is None: - text_config = {} - logger.info( - "`text_config` is `None`. Initializing the `SiglipTextConfig`" - " with default values.") - - if vision_config is None: - vision_config = {} - logger.info("`vision_config` is `None`. initializing the " - "`SiglipVisionConfig` with default values.") - - self.text_config = SiglipTextConfig(**text_config) - self.vision_config = SiglipVisionConfig(**vision_config) - - self.initializer_factor = 1.0 - - @classmethod - def from_text_vision_configs(cls, text_config: SiglipTextConfig, - vision_config: SiglipVisionConfig, **kwargs): - r""" - Instantiate a [`SiglipConfig`] (or a derived class) from siglip text - model configuration and siglip vision - model configuration. - Returns: - [`SiglipConfig`]: An instance of a configuration object - """ - - return cls(text_config=text_config.to_dict(), - vision_config=vision_config.to_dict(), - **kwargs) - - -# coding=utf-8 -# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" PyTorch Siglip model.""" - -_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224" - -SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "google/siglip-base-patch16-224", - # See all SigLIP models at https://huggingface.co/models?filter=siglip -] - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad( - torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -def _trunc_normal_(tensor, mean, std, a, b): - # Cut & paste from PyTorch official master until it's in a few official - # releases - RW - # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/ - # truncated_normal.pdf - def norm_cdf(x): - # Computes standard normal cumulative distribution function - return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 - - if (mean < a - 2 * std) or (mean > b + 2 * std): - warnings.warn( - "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " - "The distribution of values may be incorrect.", - stacklevel=2, - ) - - # Values are generated by using a truncated uniform distribution and - # then using the inverse CDF for the normal distribution. - # Get upper and lower cdf values - l = norm_cdf((a - mean) / std) # noqa - u = norm_cdf((b - mean) / std) # noqa - - # Uniformly fill tensor with values from [l, u], then translate to - # [2l-1, 2u-1]. - tensor.uniform_(2 * l - 1, 2 * u - 1) - - # Use inverse cdf transform for normal distribution to get truncated - # standard normal - if tensor.dtype in [torch.float16, torch.bfloat16]: - # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu - og_dtype = tensor.dtype - tensor = tensor.to(torch.float32) - tensor.erfinv_() - tensor = tensor.to(og_dtype) - else: - tensor.erfinv_() - - # Transform to proper mean, std - tensor.mul_(std * math.sqrt(2.0)) - tensor.add_(mean) - - # Clamp to ensure it's in the proper range - if tensor.dtype == torch.float16: - # The `clamp_` op is not (yet?) defined in float16+cpu - tensor = tensor.to(torch.float32) - tensor.clamp_(min=a, max=b) - tensor = tensor.to(torch.float16) - else: - tensor.clamp_(min=a, max=b) - - -def trunc_normal_tf_(tensor: torch.Tensor, - mean: float = 0.0, - std: float = 1.0, - a: float = -2.0, - b: float = 2.0) -> torch.Tensor: - """Fills the input Tensor with values drawn from a truncated - normal distribution. The values are effectively drawn from the - normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)` - with values outside :math:`[a, b]` redrawn until they are within - the bounds. The method used for generating the random values works - best when :math:`a \\leq \text{mean} \\leq b`. - NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where - the bounds [a, b] are applied when sampling the normal distribution with - mean=0, std=1.0 and the result is subsequently scaled and shifted by the - mean and std args. 
- Args: - tensor: an n-dimensional `torch.Tensor` - mean: the mean of the normal distribution - std: the standard deviation of the normal distribution - a: the minimum cutoff value - b: the maximum cutoff value - """ - with torch.no_grad(): - _trunc_normal_(tensor, 0, 1.0, a, b) - tensor.mul_(std).add_(mean) - - -def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"): - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) - if mode == "fan_in": - denom = fan_in - elif mode == "fan_out": - denom = fan_out - elif mode == "fan_avg": - denom = (fan_in + fan_out) / 2 - - variance = scale / denom - - if distribution == "truncated_normal": - # constant is stddev of standard normal truncated to (-2, 2) - trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978) - elif distribution == "normal": - with torch.no_grad(): - tensor.normal_(std=math.sqrt(variance)) - elif distribution == "uniform": - bound = math.sqrt(3 * variance) - with torch.no_grad(): - tensor.uniform_(-bound, bound) - else: - raise ValueError(f"invalid distribution {distribution}") - - -def lecun_normal_(tensor): - variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal") - - -def default_flax_embed_init(tensor): - variance_scaling_(tensor, mode="fan_in", distribution="normal") - - -@dataclass -# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with -# CLIP->Siglip -class SiglipVisionModelOutput(ModelOutput): - """ - Base class for vision model's outputs that also contains image embeddings - of the pooling of the last hidden states. - Args: - image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` - *optional* returned when model is initialized with - `with_projection=True`): - The image embeddings obtained by applying the projection layer to - the pooler_output. - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the - model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, - if the model has an embedding layer, + one for the output of each - layer) of shape `(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the - optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_attentions=True` is passed or when - `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape - `(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the - weighted average in the self-attention heads. - """ - - image_embeds: Optional[torch.FloatTensor] = None - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with -# CLIP->Siglip -class SiglipTextModelOutput(ModelOutput): - """ - Base class for text model's outputs that also contains a pooling of the - last hidden states. 
- Args: - text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` - *optional* returned when model is initialized with - `with_projection=True`): - The text embeddings obtained by applying the projection layer to - model. - the pooler_output. - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the - embeddings, if the model has an embedding layer, + one for the - output of each layer) of shape `(batch_size, sequence_length, - hidden_size)`. - Hidden-states of the model at the output of each layer plus the - optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_attentions=True` is passed or when - `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape - `(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute - the weighted average in the self-attention heads. - """ - - text_embeds: Optional[torch.FloatTensor] = None - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -# Copied from transformers.models.clip.modeling_clip.CLIPOutput with -# CLIP->Siglip -class SiglipOutput(ModelOutput): - """ - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when - `return_loss` is `True`): - Contrastive loss for image-text similarity. - logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, - text_batch_size)`): - The scaled dot product scores between `image_embeds` and - `text_embeds`. This represents the image-text similarity scores. - logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, - image_batch_size)`): - The scaled dot product scores between `text_embeds` and - `image_embeds`. This represents the text-image similarity scores. - text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): - The text embeddings obtained by applying the projection layer to - the pooled output of [`SiglipTextModel`]. - image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): - The image embeddings obtained by applying the projection layer to - the pooled output of [`SiglipVisionModel`]. - text_model_output(`BaseModelOutputWithPooling`): - The output of the [`SiglipTextModel`]. - vision_model_output(`BaseModelOutputWithPooling`): - The output of the [`SiglipVisionModel`]. 
- """ - - loss: Optional[torch.FloatTensor] = None - logits_per_image: torch.FloatTensor = None - logits_per_text: torch.FloatTensor = None - text_embeds: torch.FloatTensor = None - image_embeds: torch.FloatTensor = None - text_model_output: BaseModelOutputWithPooling = None - vision_model_output: BaseModelOutputWithPooling = None - - def to_tuple(self) -> Tuple[Any]: - return tuple( - self[k] if k not in ["text_model_output", "vision_model_output" - ] else getattr(self, k).to_tuple() - for k in self.keys()) - - -class SiglipVisionEmbeddings(nn.Module): - - def __init__(self, config: SiglipVisionConfig): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.image_size = config.image_size - self.patch_size = config.patch_size - - self.patch_embedding = nn.Conv2d( - in_channels=config.num_channels, - out_channels=self.embed_dim, - kernel_size=self.patch_size, - stride=self.patch_size, - padding="valid", - ) - - self.num_patches_per_side = self.image_size // self.patch_size - self.num_patches = self.num_patches_per_side**2 - self.num_positions = self.num_patches - self.position_embedding = nn.Embedding(self.num_positions, - self.embed_dim) - - def forward(self, pixel_values: torch.FloatTensor, - patch_attention_mask: torch.BoolTensor) -> torch.Tensor: - batch_size = pixel_values.size(0) - - patch_embeds = self.patch_embedding(pixel_values) - embeddings = patch_embeds.flatten(2).transpose(1, 2) - - max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3) - max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, \ - max_im_w // self.patch_size - boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, - 1 / self.num_patches_per_side) - position_ids = torch.full( - size=( - batch_size, - max_nb_patches_h * max_nb_patches_w, - ), - fill_value=0, - ) - - for batch_idx, p_attn_mask in enumerate(patch_attention_mask): - nb_patches_h = p_attn_mask[:, 0].sum() - nb_patches_w = p_attn_mask[0].sum() - - fractional_coords_h = torch.linspace(0, 1 - 1 / nb_patches_h, - nb_patches_h) - fractional_coords_w = torch.linspace(0, 1 - 1 / nb_patches_w, - nb_patches_w) - - bucket_coords_h = torch.bucketize(fractional_coords_h, - boundaries, - right=True) - bucket_coords_w = torch.bucketize(fractional_coords_w, - boundaries, - right=True) - - pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + - bucket_coords_w).flatten() - position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids - - position_ids = position_ids.to(self.position_embedding.weight.device) - - embeddings = embeddings + self.position_embedding(position_ids) - return embeddings - - -# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with -# CLIP->Siglip -class SiglipTextEmbeddings(nn.Module): - - def __init__(self, config: SiglipTextConfig): - super().__init__() - embed_dim = config.hidden_size - - self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) - self.position_embedding = nn.Embedding(config.max_position_embeddings, - embed_dim) - - # position_ids (1, len position emb) is contiguous in memory and - # exported when serialized - self.register_buffer( - "position_ids", - torch.arange(config.max_position_embeddings).expand((1, -1)), - persistent=False) - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - ) -> torch.Tensor: - seq_length = input_ids.shape[ - -1] if input_ids is not None else inputs_embeds.shape[-2] - - if 
position_ids is None: - position_ids = self.position_ids[:, :seq_length] - - if inputs_embeds is None: - inputs_embeds = self.token_embedding(input_ids) - - position_embeddings = self.position_embedding(position_ids) - embeddings = inputs_embeds + position_embeddings - - return embeddings - - -class SiglipAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__ - def __init__(self, config): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`:" - f" {self.embed_dim} and `num_heads`: {self.num_heads}).") - self.scale = self.head_dim**-0.5 - self.dropout = config.attention_dropout - - self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - batch_size, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - key_states = key_states.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - - k_v_seq_len = key_states.shape[-2] - attn_weights = torch.matmul(query_states, key_states.transpose( - 2, 3)) * self.scale - - if attn_weights.size() != (batch_size, self.num_heads, q_len, - k_v_seq_len): - raise ValueError( - f"Attention weights should be of size " - f"{(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" - f" {attn_weights.size()}") - - if attention_mask is not None: - if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): - raise ValueError(f"Attention mask should be of size " - f"{(batch_size, 1, q_len, k_v_seq_len)}, " - f"but is {attention_mask.size()}") - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to( - query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, - p=self.dropout, - training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (batch_size, self.num_heads, q_len, - self.head_dim): - raise ValueError( - f"`attn_output` should be of size " - f"{(batch_size, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}") - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights - - -class SiglipFlashAttention2(SiglipAttention): - """ - Llama flash attention module. This module inherits from `LlamaAttention` as - the weights of the module stays untouched. 
The only required change would - be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any - of them. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.is_causal = False # Hack to make sure we don't use a causal mask - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - output_attentions = False - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # Flash attention requires the input to have the shape - # batch_size x seq_length x head_dim x hidden_dim - # therefore we just need to keep the original shape - query_states = query_states.view(bsz, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length( - kv_seq_len, self.layer_idx) - - # TODO: These transpose are quite inefficient but Flash Attention - # requires the layout [batch_size, sequence_length, num_heads, - # head_dim]. We would need to refactor the KV cache - # to be able to avoid many of these transpose/reshape/view. - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - dropout_rate = self.dropout if self.training else 0.0 - - # In PEFT, usually we cast the layer norms in float32 for training - # stability reasons therefore the input hidden states gets silently - # casted in float32. Hence, we need cast them back in the correct - # dtype just to be sure everything works as expected. - # This might slowdown training & inference so it is recommended to - # not cast the LayerNorms in fp32. (LlamaRMSNorm handles it correctly) - - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - "The input hidden states seems to be silently casted in " - "float32, this might be related to the fact you have upcasted " - "embedding or layer norm layers in float32. 
We will cast " - f"back the input in {target_dtype}.") - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - attn_output = self._flash_attention_forward(query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate) - - attn_output = attn_output.reshape(bsz, q_len, - self.embed_dim).contiguous() - attn_output = self.out_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights - - def _flash_attention_forward(self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None): - """ - Calls the forward method of Flash Attention - if the input hidden - states contain at least one padding token first unpad the input, - then computes the attention scores and pad the final attention - scores. - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size - `(batch_size, seq_len)` where 0 stands for the position - of padding tokens and 1 for the position of non-padding - tokens. - dropout (`int`, *optional*): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / - sqrt(head_dim) - """ - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import pad_input # noqa - - # TODO: Remove the `query_length != 1` check once Flash Attention for - # RoCm is bumped to 2.1. For details, please see the comment in - # LlamaFlashAttention2 __init__. 
- causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, \ - max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, - query_length) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, - query_length) - else: - attn_output = flash_attn_func(query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, - query_length): - from flash_attn.bert_padding import index_first_axis, unpad_input - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data( - attention_mask) - batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape - - key_layer = index_first_axis( - key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, - head_dim), indices_k) - value_layer = index_first_axis( - value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, - head_dim), indices_k) - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, self.num_heads, - head_dim), indices_k) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
- attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = \ - unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip -class SiglipMLP(nn.Module): - - def __init__(self, config): - super().__init__() - self.config = config - self.activation_fn = ACT2FN[config.hidden_act] - self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) - self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.fc1(hidden_states) - hidden_states = self.activation_fn(hidden_states) - hidden_states = self.fc2(hidden_states) - return hidden_states - - -# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with -# CLIP->Siglip -class SiglipEncoderLayer(nn.Module): - - def __init__(self, config: SiglipConfig): - super().__init__() - self.embed_dim = config.hidden_size - self.self_attn = (SiglipAttention(config) if - not getattr(config, "_flash_attn_2_enabled", False) - else SiglipFlashAttention2(config)) - self.layer_norm1 = nn.LayerNorm(self.embed_dim, - eps=config.layer_norm_eps) - self.mlp = SiglipMLP(config) - self.layer_norm2 = nn.LayerNorm(self.embed_dim, - eps=config.layer_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: torch.Tensor, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor]: - """ - Args: - hidden_states (`torch.FloatTensor`): - Input to the layer of shape `(batch, seq_len, embed_dim)`. - attention_mask (`torch.FloatTensor`): - Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where - padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all - attention layers. See `attentions` under returned tensors for - more detail. - """ - residual = hidden_states - - hidden_states = self.layer_norm1(hidden_states) - hidden_states, attn_weights = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - output_attentions=output_attentions, - ) - hidden_states = residual + hidden_states - - residual = hidden_states - hidden_states = self.layer_norm2(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states, ) - - if output_attentions: - outputs += (attn_weights, ) - - return outputs - - -class SiglipPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface - for downloading and loading pretrained models. 
- """ - - config_class = SiglipConfig - base_model_prefix = "siglip" - supports_gradient_checkpointing = True - - def _init_weights(self, module): - """Initialize the weights""" - - if isinstance(module, SiglipVisionEmbeddings): - width = (self.config.vision_config.hidden_size if isinstance( - self.config, SiglipConfig) else self.config.hidden_size) - nn.init.normal_(module.position_embedding.weight, - std=1 / np.sqrt(width)) - elif isinstance(module, nn.Embedding): - default_flax_embed_init(module.weight) - elif isinstance(module, SiglipAttention): - nn.init.normal_(module.q_proj.weight) - nn.init.normal_(module.k_proj.weight) - nn.init.normal_(module.v_proj.weight) - nn.init.normal_(module.out_proj.weight) - nn.init.zeros_(module.q_proj.bias) - nn.init.zeros_(module.k_proj.bias) - nn.init.zeros_(module.v_proj.bias) - nn.init.zeros_(module.out_proj.bias) - elif isinstance(module, SiglipMLP): - nn.init.normal_(module.fc1.weight) - nn.init.normal_(module.fc2.weight) - nn.init.normal_(module.fc1.bias, std=1e-6) - nn.init.normal_(module.fc2.bias, std=1e-6) - elif isinstance(module, SiglipMultiheadAttentionPoolingHead): - nn.init.normal_(module.probe.data) - nn.init.normal_(module.attention.in_proj_weight.data) - nn.init.zeros_(module.attention.in_proj_bias.data) - elif isinstance(module, SiglipModel): - logit_scale_init = torch.tensor(0.0) - module.logit_scale.data.fill_(logit_scale_init) - module.logit_bias.data.zero_() - elif isinstance(module, (nn.Linear, nn.Conv2d)): - lecun_normal_(module.weight) - if module.bias is not None: - nn.init.zeros_(module.bias) - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - -SIGLIP_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass - documentation for the generic methods the library implements for all - its model (such as downloading or saving, resizing the input embeddings, - pruning heads etc.) - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/ - stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation - for all matter related to general usage and behavior. - Parameters: - config ([`SiglipConfig`]): Model configuration class with all the - parameters of the model. - Initializing with a config file does not load the weights - associated with the model, only the configuration. Check out - the [`~PreTrainedModel.from_pretrained`] method to load the - model weights. -""" - -SIGLIP_TEXT_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length) - `): - Indices of input sequence tokens in the vocabulary. Padding will - be ignored by default should you provide it. - Indices can be obtained using [`AutoTokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] - for details. [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, - sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask - values selected in `[0, 1]`: - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - [What are attention masks?](../glossary#attention-mask) - position_ids (`torch.LongTensor` of shape `(batch_size, - sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position - embeddings. Selected in the range `[0, - config.max_position_embeddings - 1]`. 
- [What are position IDs?](../glossary#position-ids) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention - layers. See `attentions` under returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See - `hidden_states` under returned tensors for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a - plain tuple. -""" - -SIGLIP_VISION_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, - num_channels, height, width)`): - Pixel values. Padding will be ignored by default should you - provide it. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] - for details. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention - layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See - `hidden_states` under returned tensors for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a - plain tuple. -""" - -SIGLIP_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, - sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding - will be ignored by default should you provide it. - Indices can be obtained using [`AutoTokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] - for details. [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)` - , *optional*): - Mask to avoid performing attention on padding token indices. Mask - values selected in `[0, 1]`: - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - [What are attention masks?](../glossary#attention-mask) - position_ids (`torch.LongTensor` of shape `(batch_size, - sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position - embeddings. Selected in the range `[0, - config.max_position_embeddings - 1]`. - [What are position IDs?](../glossary#position-ids) - pixel_values (`torch.FloatTensor` of shape `(batch_size, - num_channels, height, width)`): - Pixel values. Padding will be ignored by default should you - provide it. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] - for details. - return_loss (`bool`, *optional*): - Whether or not to return the contrastive loss. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention - layers. See `attentions` under returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See - `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a - plain tuple. -""" - - -# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with -# CLIP->Siglip -class SiglipEncoder(nn.Module): - """ - Transformer encoder consisting of `config.num_hidden_layers` - self attention layers. Each layer is a [`SiglipEncoderLayer`]. 
- Args: - config: SiglipConfig - """ - - def __init__(self, config: SiglipConfig): - super().__init__() - self.config = config - self.layers = nn.ModuleList([ - SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers) - ]) - self.gradient_checkpointing = False - - # Ignore copy - def forward( - self, - inputs_embeds, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: - r""" - Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`): - Optionally, instead of passing `input_ids` you can choose to - directly pass an embedded representation. - This is useful if you want more control over how to convert - `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. - attention_mask (`torch.Tensor` of shape `(batch_size, - sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. - Mask values selected in `[0, 1]`: - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all - attention layers. See `attentions` under returned tensors for - more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See - `hidden_states` under returned tensors for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a - plain tuple. - """ - output_attentions = output_attentions if output_attentions \ - is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else \ - self.config.use_return_dict - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - - hidden_states = inputs_embeds - for encoder_layer in self.layers: - if output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - encoder_layer.__call__, - hidden_states, - attention_mask, - output_attentions, - ) - else: - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - output_attentions=output_attentions, - ) - - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1], ) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) - - if not return_dict: - return tuple( - v for v in [hidden_states, encoder_states, all_attentions] - if v is not None) - return BaseModelOutput(last_hidden_state=hidden_states, - hidden_states=encoder_states, - attentions=all_attentions) - - -class SiglipTextTransformer(nn.Module): - - def __init__(self, config: SiglipTextConfig): - super().__init__() - self.config = config - embed_dim = config.hidden_size - self.embeddings = SiglipTextEmbeddings(config) - self.encoder = SiglipEncoder(config) - self.final_layer_norm = nn.LayerNorm(embed_dim, - eps=config.layer_norm_eps) - - self.head = nn.Linear(embed_dim, embed_dim) - - @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING) - 
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, - config_class=SiglipTextConfig) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - """ - output_attentions = output_attentions if output_attentions \ - is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states \ - is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else \ - self.config.use_return_dict - - if input_ids is None: - raise ValueError("You have to specify input_ids") - - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - - hidden_states = self.embeddings(input_ids=input_ids, - position_ids=position_ids) - - # note: SigLIP's text model does not use a causal mask, unlike the - # original CLIP model. - # expand attention_mask - if attention_mask is not None: - # [batch_size, seq_len] -> - # [batch_size, 1, tgt_seq_len, src_seq_len] - attention_mask = _prepare_4d_attention_mask( - attention_mask, hidden_states.dtype) - - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - last_hidden_state = encoder_outputs[0] - last_hidden_state = self.final_layer_norm(last_hidden_state) - - # Assuming "sticky" EOS tokenization, last token is always EOS. - pooled_output = last_hidden_state[:, -1, :] - pooled_output = self.head(pooled_output) - - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - -@add_start_docstrings( - """The text model from SigLIP without any head or projection on top.""", - SIGLIP_START_DOCSTRING, -) -class SiglipTextModel(SiglipPreTrainedModel): - config_class = SiglipTextConfig - - _no_split_modules = ["SiglipTextEmbeddings", "SiglipEncoderLayer"] - - def __init__(self, config: SiglipTextConfig): - super().__init__(config) - self.text_model = SiglipTextTransformer(config) - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self) -> nn.Module: - return self.text_model.embeddings.token_embedding - - def set_input_embeddings(self, value): - self.text_model.embeddings.token_embedding = value - - @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, - config_class=SiglipTextConfig) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - Examples: - ```python - >>> from transformers import AutoTokenizer, SiglipTextModel - >>> model = SiglipTextModel. - from_pretrained("google/siglip-base-patch16-224") - >>> tokenizer = AutoTokenizer. 
- from_pretrained("google/siglip-base-patch16-224") - >>> # important: make sure to set padding="max_length" - as that's how the model was trained - >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], - padding="max_length", return_tensors="pt") - >>> outputs = model(**inputs) - >>> last_hidden_state = outputs.last_hidden_state - >>> pooled_output = outputs.pooler_output # pooled (EOS token) - states - ```""" - return_dict = return_dict if return_dict is not None else \ - self.config.use_return_dict - - return self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - -class SiglipVisionTransformer(nn.Module): - - def __init__(self, config: SiglipVisionConfig): - super().__init__() - self.config = config - embed_dim = config.hidden_size - - self.embeddings = SiglipVisionEmbeddings(config) - self.encoder = SiglipEncoder(config) - self.post_layernorm = nn.LayerNorm(embed_dim, - eps=config.layer_norm_eps) - self.head = SiglipMultiheadAttentionPoolingHead(config) - - @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, - config_class=SiglipVisionConfig) - def forward( - self, - pixel_values, - patch_attention_mask: Optional[torch.BoolTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - """ - output_attentions = output_attentions if output_attentions is not None\ - else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None \ - else self.config.use_return_dict - - batch_size = pixel_values.size(0) - if patch_attention_mask is None: - patch_attention_mask = torch.ones( - size=( - batch_size, - pixel_values.size(2) // self.config.patch_size, - pixel_values.size(3) // self.config.patch_size, - ), - dtype=torch.bool, - device=pixel_values.device, - ) - - hidden_states = self.embeddings( - pixel_values=pixel_values, - patch_attention_mask=patch_attention_mask) - - patch_attention_mask = patch_attention_mask.view(batch_size, -1) - # The call to `_upad_input` in `_flash_attention_forward` is expensive - # So when the `patch_attention_mask` is full of 1s (i.e. 
attending - # to the whole sequence), avoiding passing the attention_mask, which - # is equivalent to attending to the full sequence - if not torch.any(~patch_attention_mask): - attention_mask = None - else: - attention_mask = (_prepare_4d_attention_mask( - patch_attention_mask, hidden_states.dtype) - if not self.config._flash_attn_2_enabled else - patch_attention_mask) - - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - last_hidden_state = encoder_outputs[0] - last_hidden_state = self.post_layernorm(last_hidden_state) - - pooled_output = self.head( - hidden_state=last_hidden_state, - attention_mask=patch_attention_mask, - ) - - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - -class SiglipMultiheadAttentionPoolingHead(nn.Module): - """Multihead Attention Pooling.""" - - def __init__(self, config: SiglipVisionConfig): - super().__init__() - - self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size)) - self.attention = torch.nn.MultiheadAttention( - config.hidden_size, config.num_attention_heads, batch_first=True) - self.layernorm = nn.LayerNorm(config.hidden_size, - eps=config.layer_norm_eps) - self.mlp = SiglipMLP(config) - - def forward(self, hidden_state, attention_mask): - batch_size = hidden_state.shape[0] - probe = self.probe.repeat(batch_size, 1, 1) - - hidden_state = self.attention(query=probe, - key=hidden_state, - value=hidden_state, - key_padding_mask=~attention_mask)[0] - - residual = hidden_state - hidden_state = self.layernorm(hidden_state) - hidden_state = residual + self.mlp(hidden_state) - - return hidden_state[:, 0] - - -@add_start_docstrings( - """The vision model from SigLIP without any head or projection on top.""", - SIGLIP_START_DOCSTRING, -) -class SiglipVisionModel(SiglipPreTrainedModel): - config_class = SiglipVisionConfig - main_input_name = "pixel_values" - - def __init__(self, config: SiglipVisionConfig): - super().__init__(config) - - self.vision_model = SiglipVisionTransformer(config) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self) -> nn.Module: - return self.vision_model.embeddings.patch_embedding - - @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, - config_class=SiglipVisionConfig) - def forward( - self, - pixel_values, - patch_attention_mask: Optional[torch.BoolTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - Examples: - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, SiglipVisionModel - >>> model = SiglipVisionModel.from_pretrained( - "google/siglip-base-patch16-224") - >>> processor = AutoProcessor.from_pretrained( - "google/siglip-base-patch16-224") - >>> url = - "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - >>> inputs = processor(images=image, return_tensors="pt") - >>> outputs = model(**inputs) - >>> 
last_hidden_state = outputs.last_hidden_state - >>> pooled_output = outputs.pooler_output # pooled features - ```""" - return_dict = return_dict if return_dict is not None \ - else self.config.use_return_dict - - return self.vision_model( - pixel_values=pixel_values, - patch_attention_mask=patch_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - -@add_start_docstrings(SIGLIP_START_DOCSTRING) -class SiglipModel(SiglipPreTrainedModel): - config_class = SiglipConfig - - def __init__(self, config: SiglipConfig): - super().__init__(config) - - if not isinstance(config.text_config, SiglipTextConfig): - raise ValueError("config.text_config is expected to be of type " - f"SiglipTextConfig but is of type" - f" {type(config.text_config)}.") - - if not isinstance(config.vision_config, SiglipVisionConfig): - raise ValueError("config.vision_config is expected to be of type " - "SiglipVisionConfig but is of type" - f" {type(config.vision_config)}.") - - text_config = config.text_config - vision_config = config.vision_config - - self.text_model = SiglipTextTransformer(text_config) - self.vision_model = SiglipVisionTransformer(vision_config) - - self.logit_scale = nn.Parameter(torch.randn(1)) - self.logit_bias = nn.Parameter(torch.randn(1)) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING) - def get_text_features( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: - r""" - Returns: - text_features (`torch.FloatTensor` of shape `(batch_size, - output_dim`): The text embeddings obtained by - applying the projection layer to the pooled output - of [`SiglipTextModel`]. - Examples: - ```python - >>> from transformers import AutoTokenizer, AutoModel - >>> import torch - >>> model = AutoModel.from_pretrained( - "google/siglip-base-patch16-224") - >>> tokenizer = AutoTokenizer.from_pretrained( - "google/siglip-base-patch16-224") - >>> # important: make sure to set padding="max_length" as that's - how the model was trained - >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], - padding="max_length", return_tensors="pt") - >>> with torch.no_grad(): - ... text_features = model.get_text_features(**inputs) - ```""" - # Use SigLIP model's config for some fields (if specified) instead - # of those of vision & text components. 
- output_attentions = output_attentions if output_attentions is not None\ - else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None \ - else self.config.use_return_dict - - text_outputs = self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = text_outputs[1] - - return pooled_output - - @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING) - def get_image_features( - self, - pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: - r""" - Returns: - image_features (`torch.FloatTensor` of shape `(batch_size, - output_dim`): The image embeddings obtained by applying the - projection layer to the pooled output of [`SiglipVisionModel`]. - Examples: - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, AutoModel - >>> import torch - >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224") - >>> processor = AutoProcessor.from_pretrained( - "google/siglip-base-patch16-224") - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - >>> inputs = processor(images=image, return_tensors="pt") - >>> with torch.no_grad(): - ... image_features = model.get_image_features(**inputs) - ```""" - # Use SiglipModel's config for some fields (if specified) instead - # of those of vision & text components. 
- output_attentions = output_attentions if output_attentions \ - is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else \ - self.config.use_return_dict - - vision_outputs = self.vision_model( - pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = vision_outputs[1] - - return pooled_output - - @add_start_docstrings_to_model_forward(SIGLIP_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=SiglipOutput, - config_class=SiglipConfig) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - pixel_values: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - return_loss: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SiglipOutput]: - r""" - Returns: - Examples: - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, AutoModel - >>> import torch - >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224") - >>> processor = AutoProcessor.from_pretrained( - "google/siglip-base-patch16-224") - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"] - >>> # important: we pass `padding=max_length` since the model was - trained with this - >>> inputs = processor(text=texts, images=image, - padding="max_length", return_tensors="pt") - >>> with torch.no_grad(): - ... outputs = model(**inputs) - >>> logits_per_image = outputs.logits_per_image - >>> probs = torch.sigmoid(logits_per_image) # these are the - probabilities - >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'") - 31.9% that image 0 is 'a photo of 2 cats' - ```""" - # Use SigLIP model's config for some fields (if specified) instead of - # those of vision & text components. 
- output_attentions = output_attentions if output_attentions \ - is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else \ - self.config.use_return_dict - - vision_outputs = self.vision_model( - pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - text_outputs = self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - image_embeds = vision_outputs[1] - text_embeds = text_outputs[1] - - # normalized features - image_embeds = image_embeds / image_embeds.norm( - p=2, dim=-1, keepdim=True) - text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) - - # cosine similarity as logits - logits_per_text = torch.matmul(text_embeds, image_embeds.t( - )) * self.logit_scale.exp() + self.logit_bias - logits_per_image = logits_per_text.t() - - loss = None - if return_loss: - raise NotImplementedError("SigLIP loss to be implemented") - - if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, - image_embeds, text_outputs, vision_outputs) - return ((loss, ) + output) if loss is not None else output - - return SiglipOutput( - loss=loss, - logits_per_image=logits_per_image, - logits_per_text=logits_per_text, - text_embeds=text_embeds, - image_embeds=image_embeds, - text_model_output=text_outputs, - vision_model_output=vision_outputs, - ) - - -def get_siglip_vision_model(_flash_attn_2_enabled=True, **kwargs): - siglip_vision_config = { - "hidden_size": 1152, - "image_size": 448, - "intermediate_size": 4304, - "model_type": "siglip_vision_model", - "num_attention_heads": 16, - "num_hidden_layers": 27, - "patch_size": 14, - } - - # Detect attention implementation. - attn_backend: _Backend = get_vit_attn_backend(support_fa=True) - if attn_backend != _Backend.FLASH_ATTN: - _flash_attn_2_enabled = False - - model_config = SiglipVisionConfig( - **siglip_vision_config, - _flash_attn_2_enabled=_flash_attn_2_enabled, - **kwargs) - - vision_model = SiglipVisionModel(model_config).vision_model - - return vision_model From b30c75dda4f6c5e0d8b3d2b39134da38b72ea96e Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sat, 15 Mar 2025 20:21:11 -0700 Subject: [PATCH 03/34] [V1] Remove V0 fallback for mistral-tokenizer (#14873) Signed-off-by: Roger Wang --- vllm/engine/arg_utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 31d567de0efa5..4e695da4ef765 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1487,13 +1487,6 @@ class EngineArgs: recommend_to_remove=False) return False - # No MistralTokenizer support so far (not compatible - # with xgrammar) - if model_config.tokenizer_mode == "mistral": - _raise_or_fallback(feature_name="--tokenizer-mode mistral", - recommend_to_remove=False) - return False - # No CPU offloading yet. 
if self.cpu_offload_gb != EngineArgs.cpu_offload_gb: _raise_or_fallback(feature_name="--cpu-offload-gb", From 71c1e0710783e1b0427610ba9e32bed7724fa36f Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Sat, 15 Mar 2025 20:25:03 -0700 Subject: [PATCH 04/34] [Kernel] Add more tuned configs (#14877) Signed-off-by: simon-mo --- ...192,device_name=NVIDIA_A800-SXM4-80GB.json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...=64,device_name=NVIDIA_A800-SXM4-80GB.json | 146 ++++++++++++++++++ ...280,device_name=NVIDIA_A800-SXM4-80GB.json | 146 ++++++++++++++++++ ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=64,N=1280,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=64,N=2560,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ ...320,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=64,N=320,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...640,device_name=NVIDIA_A800-SXM4-80GB.json | 146 ++++++++++++++++++ ...VIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=64,N=640,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=8,N=14336,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=8,N=1792,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=8,N=2048,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...VIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=8,N=3584,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=8,N=4096,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=8,N=7168,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../layers/fused_moe/configs/README | 3 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ 
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ 
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ 105 files changed, 13627 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 
vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 
vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 
vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json new file mode 100644 index 0000000000000..0611620eb3362 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, 
+ "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..4dd00d110e486 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + 
"num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json new file mode 100644 index 0000000000000..48f9697af2639 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git 
a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..a8c05712ba587 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json new file mode 100644 index 0000000000000..f1244c61efb01 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..a2ee05da1d7c6 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + 
"num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..fc573cd6e8561 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..c6d7e96c7f0ae --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + 
"256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json new file mode 100644 index 0000000000000..21f60229ff875 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json new file mode 100644 index 0000000000000..39a9912fa4bdd --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json 
b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..05b54639d234e --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..c17a4ec346915 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..170ae7f3fff1d --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..1d9d352edebc3 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..9ad5b31675005 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..2883dfd11e7f3 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git 
a/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..8abfd84a776b7 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000000000..2fc18a5e43d29 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..be8d4a7fd23d9 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..71fdd88643c6f --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + 
"num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json new file mode 100644 index 0000000000000..c02de2f628b71 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + 
}, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..3e0bc75ff87c4 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 
8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..9f7ed6726f44e --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..21b72557e365d --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..eaf32f6d76c0a --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + 
"num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..841044a4fc6e2 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 
3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..59be497fc4287 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..e4110a5d2e70f --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 
128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..0883ef40582ea --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..1a0aa33193329 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + 
"GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..9952be6ba4abe --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 
2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..32bbadbb9eae8 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, 
+ "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..e6f753cdba35b --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 
4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..53f3394693f06 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..4dd475c02a19b --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..2ed15f30fe603 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..eb817268d4120 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..0c7062aea6c4e --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..96cbc111c7fff --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + 
"num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/README b/vllm/model_executor/layers/fused_moe/configs/README index 45d40cbfb1a2e..787bd06116646 100644 --- a/vllm/model_executor/layers/fused_moe/configs/README +++ b/vllm/model_executor/layers/fused_moe/configs/README @@ -8,3 +8,6 @@ the JSON file contains a mapping from M (batch size) to the chosen configuration The example configurations provided are for the Mixtral model for TP2 on H100 and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have N = 7168 and for TP4 we have N = 3584. + +Please feel free to tune the configurations using scripts in `benchmarks/kernels/benchmark_moe.py` +Some of the configurations files are copied from the SGLang repository. Thank you! diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..3e8ebf3f7301c --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..2bb5b457d774a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..6e2aeee9b75c2 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + 
}, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..b0f9442a6aaa8 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..bee8d03ba47cf --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..9da876d3ccb43 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, 
+ "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..0a1a252a5e032 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..d6279a1e37b6f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..3bc003647cda8 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..310dff4635c28 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git 
a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..206c8a2bac667 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..edc23530ea745 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ 
-0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..43b5bdbdff5db --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No 
newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..bffa749724ad3 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..f96f12787f6fb --- /dev/null +++ 
b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..fe3e18cf01aa1 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..e4d5b2dd02a8c --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..137b9ddaca305 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..38cac4690a8a6 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..8e6ebe21fc3c6 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + 
"16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..1225d847b7d5e --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..d44e38438c9f6 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..c559a69a77eed --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..8ec2005f02e88 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + 
"48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..65840aa538bc6 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..4e120d6d08432 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..5c298746788d9 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + 
"48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..4990268b2a9eb --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 128, 
+ "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..18afdd96fbfb2 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..51d10bb0ee1a4 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..1480e09293213 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..6bd350c388972 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..2b9f0d1ec64ed --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 
4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..d979c6b66d048 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..6eb22deb8dd2b --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..c746e7080522d --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..0b4746ceeb61d --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + 
"num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..8ec2005f02e88 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 
32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..202acf23f8ca7 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, 
+ "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..11a9bceb77c85 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..386ee59beae38 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..60df5e33eed5d --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..4f1747b81f58e --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
64, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..53bbaca407af6 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..ffe67dcf48c23 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + 
"128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..2a17e164e9ec7 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..b259993b617c3 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..a71ab88d43c1e --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..eda96e76cb6d9 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..bd0767b5ef66f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + 
"3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..29f7651876940 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..6db13852c9d4e --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..1a47cae9e17bd --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..8dd5ae5c49715 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git 
a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..6d1a8b56a2831 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..e77abaf396831 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ 
-0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..01327b2c4f907 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff 
--git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..12eea5fb6687a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..9db9daece8c18 --- /dev/null +++ 
b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..365f8d0d8abc0 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..f080ea5da7dd1 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..e9bf04442a91f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..c37aced26e8d5 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + 
"4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..d6bef7f60c614 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..8df6e4b6e5dc8 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + 
"num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file From b82662d9523d9aa1386d8d1de410426781a1fa3b Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 15 Mar 2025 20:26:19 -0700 Subject: [PATCH 05/34] [BugFix] Fix torch distributed stateless PG backend init (#14870) Signed-off-by: Nick Hill --- examples/offline_inference/data_parallel.py | 5 +++++ vllm/distributed/utils.py | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index b00519314d8bd..b73770ce382cf 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -76,5 +76,10 @@ if __name__ == "__main__": GPUs_per_dp_rank)) proc.start() procs.append(proc) + exit_code = 0 for proc in procs: proc.join() + if proc.exitcode: + exit_code = proc.exitcode + + exit(exit_code) diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 25202062e9757..84899358a6d66 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -299,13 +299,10 @@ def stateless_init_torch_distributed_process_group( # different systems (e.g. RPC) in case the store is multi-tenant. 
prefix_store = PrefixStore(init_method, store) - pg_options = ProcessGroup.Options(backend=backend, timeout=timeout) - pg: ProcessGroup = ProcessGroup( prefix_store, group_rank, group_size, - pg_options, ) if backend == "gloo": @@ -327,7 +324,10 @@ def stateless_init_torch_distributed_process_group( backend_options) backend_type = ProcessGroup.BackendType.NCCL device = torch.device("cuda") + else: + raise RuntimeError(f"Unsupported torch distributed backend: {backend}") + pg._set_default_backend(backend_type) backend_class._set_sequence_number_for_group() pg._register_backend(device, backend_type, backend_class) From d1ad2a57af72fb4c9bb4b6c7cfc58e0159693fc6 Mon Sep 17 00:00:00 2001 From: Lily Liu Date: Sun, 16 Mar 2025 00:29:22 -0700 Subject: [PATCH 06/34] [V1] [Spec Decode] Fix ngram tests (#14878) --- tests/v1/spec_decode/test_ngram.py | 55 ++++++++++++++++-------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py index ec663c84d0d2a..2c2e125ade48c 100644 --- a/tests/v1/spec_decode/test_ngram.py +++ b/tests/v1/spec_decode/test_ngram.py @@ -1,32 +1,37 @@ # SPDX-License-Identifier: Apache-2.0 -import pytest -from vllm.v1.spec_decode.ngram_proposer import NgramProposer -from vllm.v1.utils import ConstantList +import numpy as np + +from vllm.v1.spec_decode.ngram_proposer import (_find_subarray_kmp, + _kmp_lps_array) -@pytest.fixture -def proposer(): - return NgramProposer() +def test_kmp_lps_array(): + np.testing.assert_array_equal(_kmp_lps_array(np.array([])), np.array([])) + np.testing.assert_array_equal(_kmp_lps_array(np.array([1])), np.array([0])) + np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 1, 1])), + np.array([0, 1, 2])) + np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 2, 3, 4])), + np.array([0, 0, 0, 0])) + np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 2, 1, 2, 3])), + np.array([0, 0, 1, 2, 0])) -def test_kmp_lps_array(proposer): - assert proposer._kmp_lps_array([]) == [] - assert proposer._kmp_lps_array([1]) == [0] - assert proposer._kmp_lps_array([1, 1, 1]) == [0, 1, 2] - assert proposer._kmp_lps_array([1, 2, 3, 4]) == [0, 0, 0, 0] - assert proposer._kmp_lps_array([1, 2, 1, 2, 3]) == [0, 0, 1, 2, 0] - - -def test_find_subarray_kmp(proposer): - X = ConstantList([1, 2, 3, 4, 1, 2, 3, 5, 6]) - assert proposer._find_subarray_kmp(X, 2, 2) is None - X = ConstantList([1, 2, 3, 4, 1, 2, 3]) - assert proposer._find_subarray_kmp(X, 2, 3) == [4, 1, 2] - assert proposer._find_subarray_kmp(X, 2, 2) == [4, 1] - assert proposer._find_subarray_kmp(X, 1, 3) == [4, 1, 2] - assert proposer._find_subarray_kmp(X, 1, 2) == [4, 1] - X = ConstantList([1, 3, 6, 2, 3, 4, 1, 2, 3]) - assert proposer._find_subarray_kmp(X, 2, 3) == [4, 1, 2] +def test_find_subarray_kmp(): + X = np.array([1, 2, 3, 4, 1, 2, 3, 5, 6]) + assert _find_subarray_kmp(X, 2, 2) is None + X = np.array([1, 2, 3, 4, 1, 2, 3]) + np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 3), + np.array([4, 1, 2])) + np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 2), np.array([4, + 1])) + np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 3), + np.array([4, 1, 2])) + np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 2), np.array([4, + 1])) + X = np.array([1, 3, 6, 2, 3, 4, 1, 2, 3]) + np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 3), + np.array([4, 1, 2])) # Return on the first match - assert proposer._find_subarray_kmp(X, 1, 3) == [6, 2, 3] \ No newline at end of file + 
np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 3), + np.array([6, 2, 3])) From d30aa7e9e6afd6147865c8c9fae8cd21f5ddce3d Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 16 Mar 2025 10:44:19 -0400 Subject: [PATCH 07/34] [Bugfix] Limit profiling run sequence length by max_model_len (#14785) Signed-off-by: Kyle Sayers --- vllm/inputs/registry.py | 5 +++++ vllm/worker/enc_dec_model_runner.py | 1 + vllm/worker/model_runner.py | 1 + vllm/worker/openvino_model_runner.py | 1 + vllm/worker/xpu_model_runner.py | 1 + 5 files changed, 9 insertions(+) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index b6ceb5fb82d70..24980833864b0 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -330,6 +330,11 @@ class InputRegistry: from vllm.multimodal import MultiModalKwargs from vllm.multimodal.profiling import MultiModalProfiler + if seq_len > model_config.max_model_len: + raise AssertionError( + f"Profiling attempted with sequence length ({seq_len}) " + f"greater than model length ({model_config.max_model_len})") + if mm_registry.has_processor(model_config): tokenizer = cached_tokenizer_from_config(model_config) processor = mm_registry.create_processor(model_config, diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 5f39f2fa4947c..f34597ac05db4 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -281,6 +281,7 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) + seq_len = min(seq_len, self.model_config.max_model_len) batch_size += seq_len decoder_dummy_data = self.input_registry \ diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 473bd901b5b23..3181483fe8390 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1302,6 +1302,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) + seq_len = min(seq_len, self.model_config.max_model_len) batch_size += seq_len dummy_data = self.input_registry \ diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index aa1d2cbb2df29..9b484a9f543fe 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -148,6 +148,7 @@ class OpenVINOModelRunner(ModelRunnerBase): seq_len = min( seq_data.get_len(), computed_len + seq_group_metadata.token_chunk_size, + self.model_config.max_model_len, ) if is_prompt: tokens = seq_data.get_token_ids()[computed_len:seq_len] diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 39957e661c474..2103260d8900c 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -466,6 +466,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) + seq_len = min(seq_len, self.model_config.max_model_len) batch_size += seq_len dummy_data = self.input_registry \ From e53b1350f289d65011d9251fd826646c169018df Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 17 Mar 2025 00:05:40 +0800 Subject: [PATCH 08/34] [Bugfix] Explicitly disable Phi-4-multimodal in V1 (#14889) 
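Editor's note: the diff below opts Phi-4-multimodal out of the V1 engine by adding a SupportsV0Only marker base class to Phi4MMForCausalLM. As a rough, hypothetical sketch of the marker-interface pattern this relies on (the names below are illustrative only and do not mirror vllm/model_executor/models/interfaces.py), the engine side can gate V1 usage with a plain subclass check:

# Hypothetical sketch of a marker-interface check; class and function names
# are assumptions for illustration, not vLLM's actual interfaces module.
class SupportsV0Only:
    """Marker base class: models inheriting it are treated as V0-engine-only."""


def is_v1_compatible(model_cls: type) -> bool:
    # A model counts as V1-compatible unless it carries the V0-only marker.
    return not issubclass(model_cls, SupportsV0Only)


class Phi4MMStub(SupportsV0Only):
    """Stand-in for Phi4MMForCausalLM, which the patch tags as V0-only."""


assert not is_v1_compatible(Phi4MMStub)

In this sketch the model registry would simply fall back to the V0 path whenever the check fails; the real marker lives in the interfaces module imported by the diff that follows.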
Signed-off-by: DarkLight1337 --- vllm/model_executor/models/phi4mm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 7250aaba557eb..3d4505d556e2c 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -33,7 +33,7 @@ from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from .idefics2_vision_model import Idefics2VisionTransformer -from .interfaces import SupportsLoRA, SupportsMultiModal +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsV0Only from .phi4mm_audio import AudioEmbedding from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix @@ -1433,7 +1433,8 @@ def cat_with_pad(tensors, dim, padding_value=0): "image", get_max_phi4mm_image_tokens) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi4mm) @INPUT_REGISTRY.register_input_processor(input_processor_for_phi4mm) -class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): +class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal, + SupportsV0Only): """ Implements the Phi-4-multimodal-instruct model in vLLM. """ From f6137adbcbbdea8b5023a66480de921b558bef83 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 17 Mar 2025 00:13:46 +0800 Subject: [PATCH 09/34] Revert "[Bugfix] Limit profiling run sequence length by max_model_len (#14785) (#14892) Signed-off-by: DarkLight1337 --- vllm/inputs/registry.py | 5 ----- vllm/worker/enc_dec_model_runner.py | 1 - vllm/worker/model_runner.py | 1 - vllm/worker/openvino_model_runner.py | 1 - vllm/worker/xpu_model_runner.py | 1 - 5 files changed, 9 deletions(-) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 24980833864b0..b6ceb5fb82d70 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -330,11 +330,6 @@ class InputRegistry: from vllm.multimodal import MultiModalKwargs from vllm.multimodal.profiling import MultiModalProfiler - if seq_len > model_config.max_model_len: - raise AssertionError( - f"Profiling attempted with sequence length ({seq_len}) " - f"greater than model length ({model_config.max_model_len})") - if mm_registry.has_processor(model_config): tokenizer = cached_tokenizer_from_config(model_config) processor = mm_registry.create_processor(model_config, diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index f34597ac05db4..5f39f2fa4947c 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -281,7 +281,6 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) - seq_len = min(seq_len, self.model_config.max_model_len) batch_size += seq_len decoder_dummy_data = self.input_registry \ diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 3181483fe8390..473bd901b5b23 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1302,7 +1302,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) - seq_len = min(seq_len, self.model_config.max_model_len) batch_size += seq_len dummy_data = self.input_registry \ diff --git a/vllm/worker/openvino_model_runner.py 
b/vllm/worker/openvino_model_runner.py index 9b484a9f543fe..aa1d2cbb2df29 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -148,7 +148,6 @@ class OpenVINOModelRunner(ModelRunnerBase): seq_len = min( seq_data.get_len(), computed_len + seq_group_metadata.token_chunk_size, - self.model_config.max_model_len, ) if is_prompt: tokens = seq_data.get_token_ids()[computed_len:seq_len] diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 2103260d8900c..39957e661c474 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -466,7 +466,6 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) - seq_len = min(seq_len, self.model_config.max_model_len) batch_size += seq_len dummy_data = self.input_registry \ From fc1f67715d95f24885288b75c736cc1fc1be0103 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sun, 16 Mar 2025 14:53:34 -0700 Subject: [PATCH 10/34] [BugFix][V1] Fix overhead related to bad_words sampling when not in use (#14894) Signed-off-by: Nick Hill --- tests/v1/worker/test_gpu_input_batch.py | 5 +++-- vllm/sampling_params.py | 7 ++++--- vllm/v1/worker/gpu_input_batch.py | 5 +++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 192ddefe102d2..2486c26c6071a 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -124,8 +124,9 @@ def _construct_expected_sampling_metadata( if req.sampling_params.allowed_token_ids: allowed_token_ids_mask[index_in_input_batch][ req.sampling_params.allowed_token_ids] = True - bad_words_token_ids[ - index_in_input_batch] = req.sampling_params.bad_words_token_ids + if req.sampling_params.bad_words_token_ids: + bad_words_token_ids[ + index_in_input_batch] = req.sampling_params.bad_words_token_ids return SamplingMetadata( temperature=torch.tensor(temperature, dtype=torch.float, diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index b0a5777cc8d56..9b474a37b96b6 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -235,7 +235,7 @@ class SamplingParams( # Fields used for bad words bad_words: Optional[list[str]] = None - _bad_words_token_ids: list[list[int]] = msgspec.field(default_factory=list) + _bad_words_token_ids: Optional[list[list[int]]] = None @staticmethod def from_optional( @@ -464,8 +464,9 @@ class SamplingParams( self.stop_token_ids = list(eos_ids) def update_from_tokenizer(self, tokenizer: AnyTokenizer) -> None: - if self.bad_words is None: + if not self.bad_words: return + self._bad_words_token_ids = [] for bad_word in self.bad_words: # To prohibit words both at the beginning # and in the middle of text @@ -516,7 +517,7 @@ class SamplingParams( return self._all_stop_token_ids @property - def bad_words_token_ids(self) -> list[list[int]]: + def bad_words_token_ids(self) -> Optional[list[list[int]]]: # For internal use only. 
Backward compatibility not guaranteed return self._bad_words_token_ids diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 9707cb5774cd0..55d5429a8935d 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -324,8 +324,9 @@ class InputBatch: self.allowed_token_ids_mask_cpu_tensor[req_index][ sampling_params.allowed_token_ids] = False - self.bad_words_token_ids[ - req_index] = sampling_params.bad_words_token_ids + if sampling_params.bad_words_token_ids: + self.bad_words_token_ids[ + req_index] = sampling_params.bad_words_token_ids # Add request lora ID if request.lora_request: From 31060b2757fb19ec67894b7c441383ceec9f1272 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 16 Mar 2025 14:53:53 -0700 Subject: [PATCH 11/34] [V1][BugFix] Detect interleaved sliding window attention (#14896) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c2a976108e4d4..8dd7521ff49a2 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -82,8 +82,15 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ cache_config.cache_dtype] - self.is_multimodal_model = model_config.is_multimodal_model + # NOTE(woosuk): sliding_window is None for models with interleaved + # attention. Use interleaved_sliding_window instead. self.sliding_window = model_config.get_sliding_window() + self.interleaved_sliding_window = getattr( + model_config.hf_text_config, "interleaved_sliding_window", None) + self.window_size = (self.sliding_window + or self.interleaved_sliding_window) + + self.is_multimodal_model = model_config.is_multimodal_model self.block_size = cache_config.block_size self.max_model_len = model_config.max_model_len self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size) @@ -674,7 +681,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): num_query_heads=self.num_query_heads, num_kv_heads=self.num_kv_heads, use_alibi=False, # FIXME - use_sliding_window=self.sliding_window is not None, + use_sliding_window=self.window_size is not None, num_sms=self.num_sms, ) return common_prefix_len if use_cascade else 0 From b9b5bdfc7d5cd0f8610a4de7a79327d10a09dfab Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Sun, 16 Mar 2025 15:46:42 -0700 Subject: [PATCH 12/34] [Misc] Catching Ray Compiled Graph PP test failures for V1 (#14847) --- tests/distributed/test_pipeline_parallel.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 05b6ba40506a2..4d3306509c8f2 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -350,6 +350,10 @@ def _compare_tp( else: pp_env = None + tp_env = { + "VLLM_USE_V1": vllm_major_version, + } + pp_args = [ *common_args, "--pipeline-parallel-size", @@ -374,14 +378,20 @@ def _compare_tp( ] try: - compare_two_settings(model_id, pp_args, tp_args, pp_env, method=method) + compare_two_settings(model_id, + pp_args, + tp_args, + pp_env, + tp_env, + method=method) except Exception: - if pp_env is None: - raise - else: - # Ray Compiled Graph tests are flaky, + testing_ray_compiled_graph = pp_env is not None + if testing_ray_compiled_graph and vllm_major_version 
== "0": + # Ray Compiled Graph tests are flaky for V0, # so we don't want to fail the test logger.exception("Ray Compiled Graph tests failed") + else: + raise @pytest.mark.parametrize( From 90df7f23aadad4aafc509fa950bd9b967a996e84 Mon Sep 17 00:00:00 2001 From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com> Date: Mon, 17 Mar 2025 03:10:04 +0400 Subject: [PATCH 13/34] [Doc] Add guidance for using `ccache` with `pip install -e .` in doc (#14901) --- docs/source/getting_started/installation/gpu/cuda.inc.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md index 7e3b884c2ab1e..d3e375aec10cb 100644 --- a/docs/source/getting_started/installation/gpu/cuda.inc.md +++ b/docs/source/getting_started/installation/gpu/cuda.inc.md @@ -131,6 +131,8 @@ Building from source requires a lot of compilation. If you are building from sou For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` . As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. +When using `ccache` with `pip install -e .`, you should run `CCACHE_NOHASHDIR="true" pip install --no-build-isolation -e .`. This is because `pip` creates a new folder with a random name for each build, preventing `ccache` from recognizing that the same files are being built. + [sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments. The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. 
::: From aecc780dba30db6b503754926564642374cb2c2e Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Sun, 16 Mar 2025 20:56:16 -0400 Subject: [PATCH 14/34] [V1] Enable Entrypoints Tests (#14903) --- .buildkite/test-pipeline.yaml | 1 + tests/v1/entrypoints/llm/test_struct_output_generate.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 93ac8a29c676c..a6616d7b41480 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -198,6 +198,7 @@ steps: commands: # split the test to avoid interference - pytest -v -s v1/core + - pytest -v -s v1/entrypoints - pytest -v -s v1/engine - pytest -v -s v1/sample - pytest -v -s v1/worker diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index b4eb475c23baa..98983fa05b83f 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -18,6 +18,9 @@ MODELS_TO_TEST = [ "Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410" ] +# Undo after https://github.com/vllm-project/vllm/pull/14868 +pytest.skip(allow_module_level=True) + @pytest.mark.skip_global_cleanup @pytest.mark.parametrize("guided_decoding_backend", From bb3aeddfaf338a9bbac10e3c75027b7f8c5c08e0 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Sun, 16 Mar 2025 22:06:43 -0400 Subject: [PATCH 15/34] [CI] Nightly Tests (#14898) Signed-off-by: rshaw@neuralmagic.com Signed-off-by: rshaw@neuralmagic.com Co-authored-by: rshaw@neuralmagic.com --- .../models/decoder_only/language/test_mistral.py | 1 + tests/tool_use/utils.py | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 7e1337b7d4876..4c2055361d445 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -201,6 +201,7 @@ def test_models( ) +@pytest.mark.skip("RE-ENABLE: test is currently failing on main.") @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index aad37eb9b8f3a..df117b96cd07b 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -46,6 +46,7 @@ CONFIGS: dict[str, ServerConfig] = { "model": "NousResearch/Hermes-3-Llama-3.1-8B", "arguments": [ + "--enforce-eager", "--no-enable-prefix-caching", "--tool-call-parser", "hermes", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja") ], @@ -60,6 +61,7 @@ CONFIGS: dict[str, ServerConfig] = { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "arguments": [ + "--enforce-eager", "--no-enable-prefix-caching", "--tool-call-parser", "llama3_json", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_llama3.1_json.jinja") ], @@ -70,6 +72,7 @@ CONFIGS: dict[str, ServerConfig] = { "model": "meta-llama/Llama-3.2-3B-Instruct", "arguments": [ + "--enforce-eager", "--no-enable-prefix-caching", "--tool-call-parser", "llama3_json", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_llama3.2_json.jinja") ], @@ -80,6 +83,7 @@ CONFIGS: dict[str, ServerConfig] = { "model": "mistralai/Mistral-7B-Instruct-v0.3", "arguments": [ 
+ "--enforce-eager", "--no-enable-prefix-caching", "--tool-call-parser", "mistral", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_mistral.jinja"), "--ignore-patterns=\"consolidated.safetensors\"" @@ -111,22 +115,28 @@ CONFIGS: dict[str, ServerConfig] = { "model": "ibm-granite/granite-3.0-8b-instruct", "arguments": [ + "--enforce-eager", "--no-enable-prefix-caching", "--tool-call-parser", "granite", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_granite.jinja") ], }, "granite-3.1-8b": { - "model": "ibm-granite/granite-3.1-8b-instruct", + "model": + "ibm-granite/granite-3.1-8b-instruct", "arguments": [ + "--enforce-eager", + "--no-enable-prefix-caching", "--tool-call-parser", "granite", ], - "supports_parallel": True, + "supports_parallel": + True, }, "internlm": { "model": "internlm/internlm2_5-7b-chat", "arguments": [ + "--enforce-eager", "--no-enable-prefix-caching", "--tool-call-parser", "internlm", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_internlm2_tool.jinja"), @@ -139,6 +149,7 @@ CONFIGS: dict[str, ServerConfig] = { "model": "Team-ACE/ToolACE-8B", "arguments": [ + "--enforce-eager", "--no-enable-prefix-caching", "--tool-call-parser", "pythonic", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_toolace.jinja") ], From 8a5a9b70d702feb17e79691870c638b0f1e71192 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 17 Mar 2025 10:38:15 +0800 Subject: [PATCH 16/34] [CI/Build] Update defaults for test reproducibility (#14893) Signed-off-by: DarkLight1337 --- tests/conftest.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 4716ca2e315b7..41c0e62ce14f3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -681,6 +681,17 @@ def hf_runner(): class VllmRunner: + """ + The default value of some arguments have been modified from + :class:`~vllm.LLM` as follows: + - `trust_remote_code`: Set to `True` instead of `False` for convenience. + - `seed`: Set to `0` instead of `None` for test reproducibility. + - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage. + - `block_size`: Set to `16` instead of `None` to reduce memory usage. + - `enable_chunked_prefill`: Set to `False` instead of `None` for + test reproducibility. + - `enforce_eager`: Set to `False` instead of `None` to test CUDA graph. + """ def __init__( self, @@ -688,6 +699,8 @@ class VllmRunner: task: TaskOption = "auto", tokenizer_name: Optional[str] = None, tokenizer_mode: str = "auto", + trust_remote_code: bool = True, + seed: Optional[int] = 0, # Use smaller max model length, otherwise bigger model cannot run due # to kv cache size limit. 
max_model_len: int = 1024, @@ -695,7 +708,7 @@ class VllmRunner: disable_log_stats: bool = True, tensor_parallel_size: int = 1, block_size: int = 16, - enable_chunked_prefill: bool = False, + enable_chunked_prefill: Optional[bool] = False, swap_space: int = 4, enforce_eager: Optional[bool] = False, **kwargs, @@ -705,8 +718,9 @@ class VllmRunner: task=task, tokenizer=tokenizer_name, tokenizer_mode=tokenizer_mode, - trust_remote_code=True, + trust_remote_code=trust_remote_code, dtype=dtype, + seed=seed, swap_space=swap_space, enforce_eager=enforce_eager, disable_log_stats=disable_log_stats, From faa02757307583f2c5557ff23cb41f1db4f1f29c Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 16 Mar 2025 20:19:30 -0700 Subject: [PATCH 17/34] [V1] Optimize the overhead of rewinding (#14905) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8dd7521ff49a2..4059d5b17b71b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1032,17 +1032,16 @@ class GPUModelRunner(LoRAModelRunnerMixin): # TODO(woosuk): The following loop can be slow since it iterates over # the requests one by one. Optimize. - for i, req_id in enumerate(self.input_batch.req_ids): + for i, generator in self.input_batch.generators.items(): + req_id = self.input_batch.req_ids[i] req_state = self.requests[req_id] seq_len = (req_state.num_computed_tokens + scheduler_output.num_scheduled_tokens[req_id]) if seq_len < req_state.num_tokens: - # Ignore the sampled token. + # Ignore the sampled token for partial prefills. # Rewind the generator state as if the token was not sampled. - generator = self.input_batch.generators.get(i) - if generator is not None: - # This relies on cuda-specific torch-internal impl details - generator.set_offset(generator.get_offset() - 4) + # This relies on cuda-specific torch-internal impl details + generator.set_offset(generator.get_offset() - 4) # NOTE: GPU -> CPU Sync happens here. # Move as many CPU operations as possible before this sync point. 
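For context on the rewind logic in the hunk above, here is a minimal standalone sketch (not part of any patch in this series) of the trick it relies on: a CUDA `torch.Generator` tracks a Philox offset that advances as values are drawn, so saving the offset before sampling and restoring it afterwards makes the next draw reproduce the discarded one. This assumes a CUDA device and the same cuda-specific internals the code comment calls out.

```python
import torch

# Sketch of the offset-rewind trick used for partial prefills above.
# Assumes a CUDA device; the offset semantics are a cuda-specific
# internal detail of torch.Generator, as the code comment notes.
gen = torch.Generator(device="cuda")
gen.manual_seed(0)

probs = torch.ones(16, device="cuda")

saved_offset = gen.get_offset()
discarded = torch.multinomial(probs, num_samples=1, generator=gen)

# Rewind the generator state as if the token was never sampled ...
gen.set_offset(saved_offset)

# ... so the next draw replays exactly the same sample.
replayed = torch.multinomial(probs, num_samples=1, generator=gen)
assert torch.equal(discarded, replayed)
```

The hard-coded `- 4` in the patch encodes how far the offset advances for a single sampled token on current CUDA builds; the save-and-restore form above expresses the same idea without depending on that constant.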
From 7f6c5ee06c4861ae1310f4ea5caaa2104efb4d22 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 16 Mar 2025 20:20:15 -0700 Subject: [PATCH 18/34] [V1][Minor] Add __repr__ to ConstantList (#14907) Signed-off-by: Woosuk Kwon --- vllm/v1/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 8e1fb18cca05b..6c01ed3de52d7 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -86,6 +86,9 @@ class ConstantList(Generic[T], Sequence): def __len__(self): return len(self._x) + def __repr__(self): + return f"ConstantList({self._x})" + class BackgroundProcHandle: """ From 1e799b7ec1b1c61952d2ae24c85ecf3fcb0f6de3 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sun, 16 Mar 2025 23:35:37 -0400 Subject: [PATCH 19/34] [BugFix] Fix MLA + V1 + TP==1 causing reinitialization of cuda context (#14910) --- vllm/platforms/cuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 3897584307e91..8a53337ebc087 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -152,7 +152,7 @@ class CudaPlatformBase(Platform): # here use_flashmla = (envs.VLLM_ATTENTION_BACKEND is None \ or envs.VLLM_ATTENTION_BACKEND == "FLASHMLA") - from vllm.attention.backends.flashmla import is_flashmla_supported + from vllm.attention.ops.flashmla import is_flashmla_supported if use_flashmla and is_flashmla_supported()[0] \ and cache_config.block_size != 64: cache_config.block_size = 64 From a73e183e36a818ea95f442ae1751bc66cf4f135d Mon Sep 17 00:00:00 2001 From: Sibi <85477603+t-sibiraj@users.noreply.github.com> Date: Mon, 17 Mar 2025 11:35:57 +0800 Subject: [PATCH 20/34] [Misc] Replace os environ to monkeypatch in test suite (#14516) Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com> Signed-off-by: Aaron Pham Co-authored-by: Cyrus Leung Co-authored-by: Aaron Pham --- .buildkite/test-pipeline.yaml | 2 +- .../test_basic_correctness.py | 105 +++--- .../basic_correctness/test_chunked_prefill.py | 168 +++++----- tests/basic_correctness/test_cumem.py | 62 ++-- tests/compile/test_basic_correctness.py | 207 ++++++------ tests/compile/test_full_graph.py | 115 ++++++- tests/compile/utils.py | 93 ------ tests/conftest.py | 2 +- tests/distributed/test_comm_ops.py | 85 +++-- tests/distributed/test_custom_all_reduce.py | 173 +++++----- tests/distributed/test_pipeline_partition.py | 60 ++-- tests/distributed/test_pp_cudagraph.py | 38 ++- tests/entrypoints/llm/test_accuracy.py | 4 +- .../offline_mode/test_offline_mode.py | 49 +-- .../openai/correctness/test_lmeval.py | 5 +- tests/kernels/test_attention_selector.py | 129 +++++--- tests/kernels/test_awq.py | 60 ++-- tests/kernels/test_rocm_attention_selector.py | 18 +- tests/kernels/utils.py | 64 ++-- .../{disagg_test.py => test_disagg.py} | 0 .../{module_test.py => test_module.py} | 0 .../models/decoder_only/language/test_fp8.py | 120 +++---- .../models/embedding/language/test_gritlm.py | 96 +++--- tests/models/test_oot_registration.py | 130 ++++---- tests/mq_llm_engine/test_error_handling.py | 31 +- .../multi_step/test_correctness_async_llm.py | 202 ++++++------ tests/multi_step/test_correctness_llm.py | 299 ++++++++--------- tests/neuron/1_core/test_block_table.py | 80 ++--- tests/neuron/1_core/test_prefix_prefill.py | 306 +++++++++--------- tests/plugins_tests/test_platform_plugins.py | 13 +- tests/plugins_tests/test_scheduler_plugins.py | 62 ++-- tests/prefix_caching/test_prefix_caching.py | 111 ++++--- tests/test_regression.py | 16 +- 
tests/test_utils.py | 63 ++-- tests/tpu/test_custom_dispatcher.py | 25 +- tests/tracing/test_tracing.py | 277 ++++++++-------- tests/utils.py | 11 +- tests/v1/e2e/test_ngram_spec_decode.py | 11 +- tests/v1/engine/test_async_llm.py | 11 +- tests/v1/engine/test_engine_core.py | 10 +- tests/v1/engine/test_engine_core_client.py | 5 +- tests/v1/sample/test_logprobs.py | 224 +++++++------ tests/v1/tpu/test_basic.py | 16 +- 43 files changed, 1900 insertions(+), 1658 deletions(-) delete mode 100644 tests/compile/utils.py rename tests/kv_transfer/{disagg_test.py => test_disagg.py} (100%) rename tests/kv_transfer/{module_test.py => test_module.py} (100%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a6616d7b41480..f85572e7c234c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -522,7 +522,7 @@ steps: # TODO: investigate and fix # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py + - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py - label: Plugin Tests (2 GPUs) # 40min working_dir: "/vllm-workspace/tests" diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 0cb3b739b7245..1458f0893a93c 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -47,6 +47,7 @@ def test_vllm_gc_ed(): @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("enforce_eager", [False]) def test_models( + monkeypatch: pytest.MonkeyPatch, hf_runner, model: str, backend: str, @@ -63,31 +64,33 @@ def test_models( pytest.skip( f"{backend} does not support gemma2 with full context length.") - os.environ["VLLM_ATTENTION_BACKEND"] = backend + with monkeypatch.context() as m: + m.setenv("VLLM_ATTENTION_BACKEND", backend) - # 5042 tokens for gemma2 - # gemma2 has alternating sliding window size of 4096 - # we need a prompt with more than 4096 tokens to test the sliding window - prompt = "The following numbers of the sequence " + ", ".join( - str(i) for i in range(1024)) + " are:" - example_prompts = [prompt] + # 5042 tokens for gemma2 + # gemma2 has alternating sliding window size of 4096 + # we need a prompt with more than 4096 tokens to test the sliding window + prompt = "The following numbers of the sequence " + ", ".join( + str(i) for i in range(1024)) + " are:" + example_prompts = [prompt] - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - with VllmRunner(model, - max_model_len=8192, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + with VllmRunner(model, + max_model_len=8192, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @multi_gpu_test(num_gpus=2) @@ -104,6 +107,7 @@ def 
test_models( ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"), ]) def test_models_distributed( + monkeypatch: pytest.MonkeyPatch, hf_runner, vllm_runner, example_prompts, @@ -116,34 +120,41 @@ def test_models_distributed( if test_suite != TARGET_TEST_SUITE: pytest.skip(f"Skip test for {test_suite}") - if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa - # test Ray Compiled Graph - os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" - os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" + with monkeypatch.context() as monkeypatch_context: + if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa + # test Ray Compiled Graph + monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1") + monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1") - if attention_backend: - os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend + if attention_backend: + monkeypatch_context.setenv( + "VLLM_ATTENTION_BACKEND", + attention_backend, + ) - dtype = "half" - max_tokens = 5 + dtype = "half" + max_tokens = 5 - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - with vllm_runner(model, - dtype=dtype, - tensor_parallel_size=2, - distributed_executor_backend=distributed_executor_backend - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method + # (the default method). + with vllm_runner( + model, + dtype=dtype, + tensor_parallel_size=2, + distributed_executor_backend=distributed_executor_backend, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index be007de321c8a..06c9e25ed8dd8 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -7,16 +7,22 @@ prefill requests are chunked. Run `pytest tests/models/test_chunked_prefill.py`. 
""" -import os + +from __future__ import annotations + +from typing import TYPE_CHECKING import pytest -from tests.kernels.utils import override_backend_env_variable from vllm.platforms import current_platform +from vllm.utils import STR_BACKEND_ENV_VAR from ..models.utils import check_logprobs_close, check_outputs_equal from ..utils import multi_gpu_test +if TYPE_CHECKING: + from .conftest import HfRunner, VllmRunner + MODELS = [ "facebook/opt-125m", "meta-llama/Llama-3.2-1B-Instruct", @@ -24,12 +30,14 @@ MODELS = [ @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): +def use_v0_only(monkeypatch: pytest.MonkeyPatch): """ Since this module is V0 only, set VLLM_USE_V1=0 for all tests in the file. """ - monkeypatch.setenv('VLLM_USE_V1', '0') + with monkeypatch.context() as m: + m.setenv('VLLM_USE_V1', '0') + yield @pytest.mark.parametrize("model", MODELS) @@ -42,8 +50,8 @@ def use_v0_only(monkeypatch): @pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) def test_models( - hf_runner, - vllm_runner, + hf_runner: HfRunner, + vllm_runner: VllmRunner, example_prompts, model: str, dtype: str, @@ -52,37 +60,39 @@ def test_models( enforce_eager: bool, tensor_parallel_size: int, attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """ Checks exact match decode between huggingface model and vllm runner with chunked prefill. """ - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - max_num_seqs = chunked_prefill_token_size - max_num_batched_tokens = chunked_prefill_token_size + max_num_seqs = chunked_prefill_token_size + max_num_batched_tokens = chunked_prefill_token_size - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - with vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=True, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + with vllm_runner( + model, + dtype=dtype, + max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=True, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + max_num_seqs=max_num_seqs, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @multi_gpu_test(num_gpus=2) @@ -90,57 +100,61 @@ def test_models( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) def test_models_distributed( - hf_runner, - vllm_runner, + hf_runner: HfRunner, + vllm_runner: VllmRunner, example_prompts, model: str, distributed_executor_backend: str, attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) + if (model == 
"meta-llama/Llama-3.2-1B-Instruct" + and distributed_executor_backend == "ray"): + # test Ray Compiled Graph + m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1") + m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1") - if (model == "meta-llama/Llama-3.2-1B-Instruct" - and distributed_executor_backend == "ray"): - # test Ray Compiled Graph - os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" - os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" + dtype = "half" + max_tokens = 5 + chunked_prefill_token_size = 16 - dtype = "half" - max_tokens = 5 - chunked_prefill_token_size = 16 + # Add a chunked prefill config. + max_num_seqs = min(chunked_prefill_token_size, 256) + assert chunked_prefill_token_size != -1 + enable_chunked_prefill = True + max_num_batched_tokens = chunked_prefill_token_size - # Add a chunked prefill config. - max_num_seqs = min(chunked_prefill_token_size, 256) - assert chunked_prefill_token_size != -1 - enable_chunked_prefill = True - max_num_batched_tokens = chunked_prefill_token_size + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with + # fork method (the default method). - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). + with vllm_runner( + model, + dtype=dtype, + tensor_parallel_size=2, + max_num_seqs=max_num_seqs, + enable_chunked_prefill=enable_chunked_prefill, + max_num_batched_tokens=max_num_batched_tokens, + distributed_executor_backend=distributed_executor_backend, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy( + example_prompts, + max_tokens, + ) - with vllm_runner( - model, - dtype=dtype, - tensor_parallel_size=2, - max_num_seqs=max_num_seqs, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize( @@ -158,7 +172,7 @@ def test_models_distributed( # the async postprocessor @pytest.mark.parametrize("disable_async_output_proc", [True]) def test_models_with_fp8_kv_cache( - vllm_runner, + vllm_runner: VllmRunner, example_prompts, kv_cache_dtype: str, model: str, @@ -218,7 +232,7 @@ def test_models_with_fp8_kv_cache( @pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("dtype", ["half"]) def test_with_prefix_caching( - vllm_runner, + vllm_runner: VllmRunner, max_tokens: int, enforce_eager: bool, chunk_size: int, @@ -254,8 +268,10 @@ def test_with_prefix_caching( ) as vllm_model: outputs[enable] = [] for prompt in full_prompts: - outputs[enable] += vllm_model.generate_greedy([prompt], - max_tokens) + outputs[enable] += vllm_model.generate_greedy( + [prompt], + 
max_tokens, + ) check_outputs_equal( outputs_0_lst=outputs[False], @@ -274,8 +290,8 @@ def test_with_prefix_caching( @pytest.mark.cpu_model @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") def test_models_cpu( - hf_runner, - vllm_runner, + hf_runner: HfRunner, + vllm_runner: VllmRunner, example_prompts, model: str, dtype: str, @@ -283,7 +299,7 @@ def test_models_cpu( chunked_prefill_token_size: int, enforce_eager: bool, attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: test_models( hf_runner, @@ -307,7 +323,7 @@ def test_models_cpu( @pytest.mark.cpu_model @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") def test_with_prefix_caching_cpu( - vllm_runner, + vllm_runner: VllmRunner, max_tokens: int, enforce_eager: bool, chunk_size: int, diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index ba81f2bb79d11..f5ee469fb00a9 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -123,40 +123,38 @@ def test_cumem_with_cudagraph(): # sleep mode with pytorch checkpoint ("facebook/opt-125m", False), ]) -def test_end_to_end(model: str, use_v1: bool): - import os - os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0" - free, total = torch.cuda.mem_get_info() - used_bytes_baseline = total - free # in case other process is running - llm = LLM(model, enable_sleep_mode=True) - prompt = "How are you?" - sampling_params = SamplingParams(temperature=0, max_tokens=10) - output = llm.generate(prompt, sampling_params) +def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + free, total = torch.cuda.mem_get_info() + used_bytes_baseline = total - free # in case other process is running + llm = LLM(model, enable_sleep_mode=True) + prompt = "How are you?" + sampling_params = SamplingParams(temperature=0, max_tokens=10) + output = llm.generate(prompt, sampling_params) - # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, - # which is difficult to measure in the test. therefore, we only - # test sleep level 1 here. - llm.sleep(level=1) + # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, + # which is difficult to measure in the test. therefore, we only + # test sleep level 1 here. + llm.sleep(level=1) - free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() - used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline - # now the memory usage is mostly cudagraph memory pool, - # and it should be less than the model weights (1B model, 2GiB weights) + free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline + # now the memory usage is mostly cudagraph memory pool, + # and it should be less than the model weights (1B model, 2GiB weights) - # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size) - # is captured but cannot be releasesd from PyTorch due to a known bug, - # therefore high memory usage after `llm.sleep` is called is expected. - # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode - # in V1. 
- if use_v1: - assert used_bytes < 7 * GiB_bytes - else: - assert used_bytes < 2 * GiB_bytes + # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size) + # is captured but cannot be releasesd from PyTorch due to a known bug, + # therefore high memory usage after `llm.sleep` is called is expected. + # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode + # in V1. + if use_v1: + assert used_bytes < 7 * GiB_bytes + else: + assert used_bytes < 2 * GiB_bytes - llm.wake_up() - output2 = llm.generate(prompt, sampling_params) + llm.wake_up() + output2 = llm.generate(prompt, sampling_params) - # cmp output - assert output[0].outputs[0].text == output2[0].outputs[0].text - - del os.environ["VLLM_USE_V1"] + # cmp output + assert output[0].outputs[0].text == output2[0].outputs[0].text diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 48323b21a8c42..b639fd719ca0a 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations import dataclasses -from typing import Optional import pytest @@ -22,75 +22,76 @@ class TestSetting: fullgraph: bool -# representative settings for testing -test_settings = [ - # basic llama model - TestSetting( - model="meta-llama/Llama-3.2-1B-Instruct", - model_args=[], - pp_size=2, - tp_size=2, - attn_backend="FLASHINFER", - method="generate", - fullgraph=True, - ), - # llama model with quantization - TestSetting( - model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", - model_args=["--quantization", "gptq"], - pp_size=1, - tp_size=1, - attn_backend="FLASH_ATTN", - method="generate", - fullgraph=True, - ), - # MoE model - TestSetting( - model="ibm/PowerMoE-3b", - model_args=[], - pp_size=1, - tp_size=2, - attn_backend="FLASH_ATTN", - method="generate", - fullgraph=True, - ), - # embedding model - TestSetting( - model="BAAI/bge-multilingual-gemma2", - model_args=["--task", "embed"], - pp_size=1, - tp_size=1, - attn_backend="FLASH_ATTN", - method="encode", - fullgraph=True, - ), - # encoder-based embedding model (BERT) - TestSetting( - model="BAAI/bge-base-en-v1.5", - model_args=["--task", "embed"], - pp_size=1, - tp_size=1, - attn_backend="XFORMERS", - method="encode", - fullgraph=True, - ), - # vision language model - TestSetting( - model="microsoft/Phi-3.5-vision-instruct", - model_args=["--trust-remote-code", "--max-model-len", "2048"], - pp_size=2, - tp_size=1, - attn_backend="FLASH_ATTN", - method="generate_with_image", - fullgraph=False, - ), -] - - # we cannot afford testing the full Catesian product # of all models and all levels -@pytest.mark.parametrize("test_setting", test_settings) -def test_compile_correctness(test_setting: TestSetting): +@pytest.mark.parametrize( + "test_setting", + [ + # basic llama model + TestSetting( + model="meta-llama/Llama-3.2-1B-Instruct", + model_args=[], + pp_size=2, + tp_size=2, + attn_backend="FLASHINFER", + method="generate", + fullgraph=True, + ), + # llama model with quantization + TestSetting( + model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", + model_args=["--quantization", "gptq"], + pp_size=1, + tp_size=1, + attn_backend="FLASH_ATTN", + method="generate", + fullgraph=True, + ), + # MoE model + TestSetting( + model="ibm/PowerMoE-3b", + model_args=[], + pp_size=1, + tp_size=2, + attn_backend="FLASH_ATTN", + method="generate", + fullgraph=True, + ), + # embedding model + TestSetting( + model="BAAI/bge-multilingual-gemma2", + 
model_args=["--task", "embed"], + pp_size=1, + tp_size=1, + attn_backend="FLASH_ATTN", + method="encode", + fullgraph=True, + ), + # encoder-based embedding model (BERT) + TestSetting( + model="BAAI/bge-base-en-v1.5", + model_args=["--task", "embed"], + pp_size=1, + tp_size=1, + attn_backend="XFORMERS", + method="encode", + fullgraph=True, + ), + # vision language model + TestSetting( + model="microsoft/Phi-3.5-vision-instruct", + model_args=["--trust-remote-code", "--max-model-len", "2048"], + pp_size=2, + tp_size=1, + attn_backend="FLASH_ATTN", + method="generate_with_image", + fullgraph=False, + ), + ]) +def test_compile_correctness( + monkeypatch: pytest.MonkeyPatch, + test_setting: TestSetting, +): # this test is run under multiple suits, with different GPUs. # make sure we only run the test with correct CUDA devices. # don't use "<", as it will duplicate the tests. @@ -103,41 +104,45 @@ def test_compile_correctness(test_setting: TestSetting): fullgraph = test_setting.fullgraph if cuda_device_count_stateless() != pp_size * tp_size: pytest.skip("Not correct CUDA devices for the test.") - import os - os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend - final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \ - ["-tp", str(tp_size)] - all_args: list[list[str]] = [] - all_envs: list[Optional[dict[str, str]]] = [] + with monkeypatch.context() as m: + m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) + final_args = [ + "--enforce-eager", *model_args, "-pp", + str(pp_size), "-tp", + str(tp_size) + ] - for level in [ - CompilationLevel.NO_COMPILATION, - CompilationLevel.PIECEWISE, - ]: - all_args.append(final_args + [f"-O{level}"]) - all_envs.append({}) + all_args: list[list[str]] = [] + all_envs: list[dict[str, str] | None] = [] - # inductor will change the output, so we only compare if the output - # is close, not exactly the same. - compare_all_settings( - model, - all_args, - all_envs, - method=method if method != "generate" else "generate_close") - all_envs.clear() - all_args.clear() + for level in [ + CompilationLevel.NO_COMPILATION, + CompilationLevel.PIECEWISE, + ]: + all_args.append(final_args + [f"-O{level}"]) + all_envs.append({}) - for level in [ - CompilationLevel.NO_COMPILATION, - CompilationLevel.DYNAMO_AS_IS, - CompilationLevel.DYNAMO_ONCE, - ]: - all_args.append(final_args + [f"-O{level}"]) - all_envs.append({}) - if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: - # "DYNAMO_ONCE" will always use fullgraph - all_envs[-1][ - "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore + # inductor will change the output, so we only compare if the output + # is close, not exactly the same. 
+ compare_all_settings( + model, + all_args, + all_envs, + method=method if method != "generate" else "generate_close") + all_envs.clear() + all_args.clear() - compare_all_settings(model, all_args * 3, all_envs, method=method) + for level in [ + CompilationLevel.NO_COMPILATION, + CompilationLevel.DYNAMO_AS_IS, + CompilationLevel.DYNAMO_ONCE, + ]: + all_args.append(final_args + [f"-O{level}"]) + all_envs.append({}) + if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: + # "DYNAMO_ONCE" will always use fullgraph + all_envs[-1][ + "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore + + compare_all_settings(model, all_args * 3, all_envs, method=method) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 6e83fa36881e4..cf463f3e75254 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -1,22 +1,115 @@ # SPDX-License-Identifier: Apache-2.0 -import pytest +from __future__ import annotations +from typing import Any + +import pytest +import torch + +from tests.quantization.utils import is_quant_method_supported +from vllm import LLM, SamplingParams from vllm.config import CompilationLevel +from vllm.platforms import current_platform from ..utils import fork_new_process_for_each_test -from .utils import TEST_MODELS, check_full_graph_support -@pytest.mark.parametrize("model_info", TEST_MODELS) +@pytest.fixture(params=None, name="model_info") +def models_list_fixture(request): + TEST_MODELS: list[tuple[str, dict[str, Any]]] = [ + ("facebook/opt-125m", {}), + ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { + "dtype": torch.float16, + "quantization": "compressed-tensors" + }), + ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", { + "dtype": torch.float16, + "quantization": "compressed-tensors" + }), + ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", { + "quantization": "compressed-tensors" + }), + ("meta-llama/Llama-3.2-1B-Instruct", {}), + ] + + if is_quant_method_supported("aqlm"): + TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { + "quantization": "aqlm" + })) + + # TODO: figure out why this fails. 
+ if False and is_quant_method_supported("gguf"): # noqa: SIM223 + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", { + "quantization": "gguf" + })) + + if is_quant_method_supported("gptq"): + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", { + "quantization": "gptq" + })) + + if is_quant_method_supported("gptq_marlin"): + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", { + "quantization": "gptq_marlin" + })) + + if is_quant_method_supported("gptq_marlin_24"): + TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", { + "quantization": "gptq_marlin_24" + })) + + if is_quant_method_supported("marlin"): + TEST_MODELS.append( + ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", { + "quantization": "marlin" + })) + + if not current_platform.is_rocm() and is_quant_method_supported("awq"): + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { + "quantization": "AWQ" + })) + + return TEST_MODELS + + @pytest.mark.parametrize( "optimization_level", - [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE]) + [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE], +) +@pytest.mark.parametrize("model_info", "", indirect=True) @fork_new_process_for_each_test -def test_full_graph(model_info, optimization_level): - model = model_info[0] - model_kwargs = model_info[1] - check_full_graph_support(model, - model_kwargs, - optimization_level, - tp_size=1) +def test_full_graph( + monkeypatch: pytest.MonkeyPatch, + model_info: tuple[str, dict[str, Any]], + optimization_level: int, +): + model, model_kwargs = model_info + + with monkeypatch.context() as m: + # make sure these models can be captured in full graph mode + m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") + print(f"MODEL={model}") + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0) + llm = LLM( + model=model, + enforce_eager=True, + tensor_parallel_size=1, + disable_custom_all_reduce=True, + compilation_config=optimization_level, + **model_kwargs, + ) + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/tests/compile/utils.py b/tests/compile/utils.py deleted file mode 100644 index fb8270c26b1b0..0000000000000 --- a/tests/compile/utils.py +++ /dev/null @@ -1,93 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import os - -import torch - -from tests.quantization.utils import is_quant_method_supported -from vllm import LLM, SamplingParams -from vllm.platforms import current_platform - -TEST_MODELS = [ - ("facebook/opt-125m", {}), - ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { - "dtype": torch.float16, - "quantization": "compressed-tensors" - }), - ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", { - "dtype": torch.float16, - "quantization": "compressed-tensors" - }), - ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", { - "quantization": "compressed-tensors" - }), - ("meta-llama/Llama-3.2-1B-Instruct", {}), -] - -if is_quant_method_supported("aqlm"): - TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { - "quantization": "aqlm" - })) - -# TODO: figure out why this fails. 
-if False and is_quant_method_supported("gguf"): # noqa: SIM223 - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", { - "quantization": "gguf" - })) - -if is_quant_method_supported("gptq"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", { - "quantization": "gptq" - })) - -if is_quant_method_supported("gptq_marlin"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", { - "quantization": "gptq_marlin" - })) - -if is_quant_method_supported("gptq_marlin_24"): - TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", { - "quantization": "gptq_marlin_24" - })) - -if is_quant_method_supported("marlin"): - TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", { - "quantization": "marlin" - })) - -if not current_platform.is_rocm() and is_quant_method_supported("awq"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { - "quantization": "AWQ" - })) - - -def check_full_graph_support(model, - model_kwargs, - optimization_level, - tp_size=1): - # make sure these models can be captured in full graph mode - os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1" - - print(f"MODEL={model}") - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0) - llm = LLM(model=model, - enforce_eager=True, - tensor_parallel_size=tp_size, - disable_custom_all_reduce=True, - compilation_config=optimization_level, - **model_kwargs) - - outputs = llm.generate(prompts, sampling_params) - - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/tests/conftest.py b/tests/conftest.py index 41c0e62ce14f3..30e5ca2eb137a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1110,4 +1110,4 @@ def pytest_collection_modifyitems(config, items): skip_optional = pytest.mark.skip(reason="need --optional option to run") for item in items: if "optional" in item.keywords: - item.add_marker(skip_optional) + item.add_marker(skip_optional) \ No newline at end of file diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index 7b0346b8ab50f..ac6d6aae30063 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -3,7 +3,10 @@ Run `pytest tests/distributed/test_comm_ops.py`. 
""" -import os + +from __future__ import annotations + +from typing import Any, Callable import pytest import ray @@ -17,12 +20,18 @@ from ..utils import init_test_distributed_environment, multi_process_parallel @ray.remote(num_gpus=1, max_calls=1) -def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): +def all_reduce_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): # it is important to delete the CUDA_VISIBLE_DEVICES environment variable # so that each worker can see all the GPUs # they will be able to set the device to the correct GPU - os.environ.pop("CUDA_VISIBLE_DEVICES", None) + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) + device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -39,12 +48,17 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int, @ray.remote(num_gpus=1, max_calls=1) -def all_gather_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): +def all_gather_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): # it is important to delete the CUDA_VISIBLE_DEVICES environment variable # so that each worker can see all the GPUs # they will be able to set the device to the correct GPU - os.environ.pop("CUDA_VISIBLE_DEVICES", None) + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -67,12 +81,17 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int, @ray.remote(num_gpus=1, max_calls=1) -def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): +def broadcast_tensor_dict_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): # it is important to delete the CUDA_VISIBLE_DEVICES environment variable # so that each worker can see all the GPUs # they will be able to set the device to the correct GPU - os.environ.pop("CUDA_VISIBLE_DEVICES", None) + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -106,9 +125,14 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, @ray.remote(num_gpus=1, max_calls=1) -def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): - os.environ.pop("CUDA_VISIBLE_DEVICES", None) +def send_recv_tensor_dict_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -146,9 +170,14 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, @ray.remote(num_gpus=1, max_calls=1) -def send_recv_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): - os.environ.pop("CUDA_VISIBLE_DEVICES", None) +def send_recv_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): + 
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -174,8 +203,12 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int, all_reduce_test_worker, all_gather_test_worker, broadcast_tensor_dict_test_worker ]) -def test_multi_process_tensor_parallel(tp_size, test_target): - multi_process_parallel(tp_size, 1, test_target) +def test_multi_process_tensor_parallel( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + test_target: Callable[..., Any], +): + multi_process_parallel(monkeypatch, tp_size, 1, test_target) @pytest.mark.skipif(torch.cuda.device_count() < 2, @@ -183,8 +216,12 @@ def test_multi_process_tensor_parallel(tp_size, test_target): @pytest.mark.parametrize("pp_size", [2]) @pytest.mark.parametrize( "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]) -def test_multi_process_pipeline_parallel(pp_size, test_target): - multi_process_parallel(1, pp_size, test_target) +def test_multi_process_pipeline_parallel( + monkeypatch: pytest.MonkeyPatch, + pp_size: int, + test_target: Callable[..., Any], +): + multi_process_parallel(monkeypatch, 1, pp_size, test_target) @pytest.mark.skipif(torch.cuda.device_count() < 4, @@ -197,5 +234,9 @@ def test_multi_process_pipeline_parallel(pp_size, test_target): broadcast_tensor_dict_test_worker ]) def test_multi_process_tensor_parallel_pipeline_parallel( - tp_size, pp_size, test_target): - multi_process_parallel(tp_size, pp_size, test_target) + tp_size: int, + pp_size: int, + test_target: Callable[..., Any], + monkeypatch: pytest.MonkeyPatch, +): + multi_process_parallel(monkeypatch, tp_size, pp_size, test_target) diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 4928690bebb07..bfa7d06c4d075 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import os import random import pytest @@ -23,95 +22,115 @@ for i, v in enumerate(test_sizes): @ray.remote(num_gpus=1, max_calls=1) -def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): - os.environ.pop("CUDA_VISIBLE_DEVICES", None) - device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) - init_test_distributed_environment(tp_size, pp_size, rank, - distributed_init_port) - ensure_model_parallel_initialized(tp_size, pp_size) - group = get_tensor_model_parallel_group().device_group +def graph_allreduce( + monkeypatch: pytest.MonkeyPatch, + tp_size, + pp_size, + rank, + distributed_init_port, +): + with monkeypatch.context() as m: + m.delenv("CUDA_VISIBLE_DEVICES", raising=False) + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + init_test_distributed_environment(tp_size, pp_size, rank, + distributed_init_port) + ensure_model_parallel_initialized(tp_size, pp_size) + group = get_tensor_model_parallel_group().device_group - # A small all_reduce for warmup. - # this is needed because device communicators might be created lazily - # (e.g. NCCL). This will ensure that the communicator is initialized - # before any communication happens, so that this group can be used for - # graph capture immediately. - data = torch.zeros(1) - data = data.to(device=device) - torch.distributed.all_reduce(data, group=group) - torch.cuda.synchronize() - del data + # A small all_reduce for warmup. 
+ # this is needed because device communicators might be created lazily + # (e.g. NCCL). This will ensure that the communicator is initialized + # before any communication happens, so that this group can be used for + # graph capture immediately. + data = torch.zeros(1) + data = data.to(device=device) + torch.distributed.all_reduce(data, group=group) + torch.cuda.synchronize() + del data - # we use the first group to communicate once - # and the second group to communicate twice - # and so on - # this is used to demonstrate that each group can - # communicate independently - num_communication = rank // tp_size + 1 + # we use the first group to communicate once + # and the second group to communicate twice + # and so on + # this is used to demonstrate that each group can + # communicate independently + num_communication = rank // tp_size + 1 - for sz in test_sizes: - for dtype in [torch.float32, torch.float16, torch.bfloat16]: - with graph_capture(device=device) as graph_capture_context: - # use integers so result matches NCCL exactly - inp1 = torch.randint(1, - 16, (sz, ), - dtype=dtype, - device=torch.cuda.current_device()) - inp2 = torch.randint(1, - 16, (sz, ), - dtype=dtype, - device=torch.cuda.current_device()) - torch.cuda.synchronize() - graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(graph, - stream=graph_capture_context.stream): - for i in range(num_communication): - out1 = tensor_model_parallel_all_reduce(inp1) - # the input buffer is immediately modified to test - # synchronization - dist.all_reduce(inp1, group=group) - out2 = tensor_model_parallel_all_reduce(inp2) - dist.all_reduce(inp2, group=group) - graph.replay() - torch.testing.assert_close(out1, inp1) - torch.testing.assert_close(out2, inp2) + for sz in test_sizes: + for dtype in [torch.float32, torch.float16, torch.bfloat16]: + with graph_capture(device=device) as graph_capture_context: + # use integers so result matches NCCL exactly + inp1 = torch.randint(1, + 16, (sz, ), + dtype=dtype, + device=torch.cuda.current_device()) + inp2 = torch.randint(1, + 16, (sz, ), + dtype=dtype, + device=torch.cuda.current_device()) + torch.cuda.synchronize() + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, + stream=graph_capture_context.stream): + for i in range(num_communication): + out1 = tensor_model_parallel_all_reduce(inp1) + # the input buffer is immediately modified to test + # synchronization + dist.all_reduce(inp1, group=group) + out2 = tensor_model_parallel_all_reduce(inp2) + dist.all_reduce(inp2, group=group) + graph.replay() + torch.testing.assert_close(out1, inp1) + torch.testing.assert_close(out2, inp2) @ray.remote(num_gpus=1, max_calls=1) -def eager_allreduce(tp_size, pp_size, rank, distributed_init_port): - os.environ.pop("CUDA_VISIBLE_DEVICES", None) - device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) - init_test_distributed_environment(tp_size, pp_size, rank, - distributed_init_port) +def eager_allreduce( + monkeypatch: pytest.MonkeyPatch, + tp_size, + pp_size, + rank, + distributed_init_port, +): + with monkeypatch.context() as m: + m.delenv("CUDA_VISIBLE_DEVICES", raising=False) + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + init_test_distributed_environment(tp_size, pp_size, rank, + distributed_init_port) - # we use the first group to communicate once - # and the second group to communicate twice - # and so on - # this is used to demonstrate that each group can - # communicate independently - num_communication = rank // tp_size + 1 - sz = 1024 - fa 
= get_tp_group().ca_comm - inp = torch.ones(sz, dtype=torch.float32, device=device) - out = inp - for _ in range(num_communication): - out = fa.all_reduce(out, registered=False) - torch.testing.assert_close(out, inp * (tp_size**num_communication)) + # we use the first group to communicate once + # and the second group to communicate twice + # and so on + # this is used to demonstrate that each group can + # communicate independently + num_communication = rank // tp_size + 1 + sz = 1024 + fa = get_tp_group().ca_comm + inp = torch.ones(sz, dtype=torch.float32, device=device) + out = inp + for _ in range(num_communication): + out = fa.all_reduce(out, registered=False) + torch.testing.assert_close(out, inp * (tp_size**num_communication)) - inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device) - out = inp - for _ in range(num_communication): - out = fa.all_reduce(out, registered=False) - torch.testing.assert_close(out, inp * (tp_size**num_communication)) + inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device) + out = inp + for _ in range(num_communication): + out = fa.all_reduce(out, registered=False) + torch.testing.assert_close(out, inp * (tp_size**num_communication)) @pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("pipeline_parallel_size", [1, 2]) @pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce]) -def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target): +def test_custom_allreduce( + monkeypatch: pytest.MonkeyPatch, + tp_size, + pipeline_parallel_size, + test_target, +): world_size = tp_size * pipeline_parallel_size if world_size > torch.cuda.device_count(): pytest.skip("Not enough GPUs to run the test.") - multi_process_parallel(tp_size, pipeline_parallel_size, test_target) + multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, + test_target) diff --git a/tests/distributed/test_pipeline_partition.py b/tests/distributed/test_pipeline_partition.py index 18c5be29c5ce1..7bf93f270148b 100644 --- a/tests/distributed/test_pipeline_partition.py +++ b/tests/distributed/test_pipeline_partition.py @@ -7,33 +7,35 @@ import pytest from vllm.distributed.utils import get_pp_indices -def test_custom_layer_partition(): +def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch): - def _verify(partition_str, num_layers, pp_size, goldens): - bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None) - os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str - for pp_rank, golden in enumerate(goldens): - assert get_pp_indices(num_layers, pp_rank, pp_size) == golden - if bak is not None: - os.environ["VLLM_PP_LAYER_PARTITION"] = bak + with monkeypatch.context() as m: - # Even partition - _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) - # Balanced partition - _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)]) - # Put reminder somewhere - _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)]) - # Invalid partition strings - with pytest.raises(ValueError): - _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) - with pytest.raises(ValueError): - _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) - # Wrong number of partitions - with pytest.raises(ValueError): - _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) - # Wrong number of layers - with pytest.raises(ValueError): - _verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + def _verify(partition_str, num_layers, pp_size, goldens): + bak = os.environ.get("VLLM_PP_LAYER_PARTITION", 
None) + m.setenv("VLLM_PP_LAYER_PARTITION", partition_str) + for pp_rank, golden in enumerate(goldens): + assert get_pp_indices(num_layers, pp_rank, pp_size) == golden + if bak is not None: + m.setenv("VLLM_PP_LAYER_PARTITION", bak) + + # Even partition + _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + # Balanced partition + _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)]) + # Put reminder somewhere + _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)]) + # Invalid partition strings + with pytest.raises(ValueError): + _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + with pytest.raises(ValueError): + _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + # Wrong number of partitions + with pytest.raises(ValueError): + _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + # Wrong number of layers + with pytest.raises(ValueError): + _verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) @pytest.mark.parametrize( @@ -55,6 +57,10 @@ def test_custom_layer_partition(): (5, 3, 1, (2, 4)), (5, 3, 2, (4, 5)), ]) -def test_uneven_auto_partition(num_hidden_layers: int, pp_size: int, - pp_rank: int, indices: tuple[int, int]): +def test_uneven_auto_partition( + num_hidden_layers: int, + pp_size: int, + pp_rank: int, + indices: tuple[int, int], +): assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size) diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py index 3bc85b05e7d15..19414971f2b46 100644 --- a/tests/distributed/test_pp_cudagraph.py +++ b/tests/distributed/test_pp_cudagraph.py @@ -1,11 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations -import os +from typing import TYPE_CHECKING import pytest from ..utils import compare_two_settings, fork_new_process_for_each_test +if TYPE_CHECKING: + from typing_extensions import LiteralString + @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [ (2, "JackFram/llama-160m"), @@ -15,18 +19,24 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test "FLASHINFER", ]) @fork_new_process_for_each_test -def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND): - cudagraph_args = [ - # use half precision for speed and memory savings in CI environment - "--dtype", - "float16", - "--pipeline-parallel-size", - str(PP_SIZE), - "--distributed-executor-backend", - "mp", - ] - os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND +def test_pp_cudagraph( + monkeypatch: pytest.MonkeyPatch, + PP_SIZE: int, + MODEL_NAME: str, + ATTN_BACKEND: LiteralString, +): + with monkeypatch.context() as m: + cudagraph_args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + "--pipeline-parallel-size", + str(PP_SIZE), + "--distributed-executor-backend", + "mp", + ] + m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND) - eager_args = cudagraph_args + ["--enforce-eager"] + eager_args = cudagraph_args + ["--enforce-eager"] - compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) + compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py index 3ebc5a44d80c6..77fbb5827da9e 100644 --- a/tests/entrypoints/llm/test_accuracy.py +++ b/tests/entrypoints/llm/test_accuracy.py @@ -49,7 +49,7 @@ TPU_TP_TEST_STR = "" #"tensor_parallel_size=4" @pytest.mark.skipif(not current_platform.is_cuda() and not current_platform.is_tpu(), reason="V1 is currently only supported 
on CUDA and TPU") -def test_lm_eval_accuracy_v1_engine(monkeypatch): +def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): """Run with the V1 Engine.""" with monkeypatch.context() as m: @@ -67,7 +67,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch): run_test(more_args) -def test_lm_eval_accuracy_v0_engine(monkeypatch): +def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch): """Run with the V0 Engine.""" with monkeypatch.context() as m: diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index 85156d6931c8c..23fd72f4ebbb9 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -53,32 +53,37 @@ def cache_models(): @pytest.mark.skip_global_cleanup @pytest.mark.usefixtures("cache_models") -def test_offline_mode(monkeypatch): +def test_offline_mode(monkeypatch: pytest.MonkeyPatch): # Set HF to offline mode and ensure we can still construct an LLM - try: - monkeypatch.setenv("HF_HUB_OFFLINE", "1") - monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1") + with monkeypatch.context() as m: + try: + m.setenv("HF_HUB_OFFLINE", "1") + m.setenv("VLLM_NO_USAGE_STATS", "1") - def disable_connect(*args, **kwargs): - raise RuntimeError("No http calls allowed") + def disable_connect(*args, **kwargs): + raise RuntimeError("No http calls allowed") - monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect", - disable_connect) - monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect", - disable_connect) + m.setattr( + urllib3.connection.HTTPConnection, + "connect", + disable_connect, + ) + m.setattr( + urllib3.connection.HTTPSConnection, + "connect", + disable_connect, + ) - # Need to re-import huggingface_hub and friends to setup offline mode - _re_import_modules() - # Cached model files should be used in offline mode - for model_config in MODEL_CONFIGS: - LLM(**model_config) - finally: - # Reset the environment after the test - # NB: Assuming tests are run in online mode - monkeypatch.delenv("HF_HUB_OFFLINE") - monkeypatch.delenv("VLLM_NO_USAGE_STATS") - _re_import_modules() - pass + # Need to re-import huggingface_hub + # and friends to setup offline mode + _re_import_modules() + # Cached model files should be used in offline mode + for model_config in MODEL_CONFIGS: + LLM(**model_config) + finally: + # Reset the environment after the test + # NB: Assuming tests are run in online mode + _re_import_modules() def _re_import_modules(): diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index e4c087db3d4f0..d3948e2ed575e 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -70,7 +70,7 @@ def run_test(more_args): @pytest.mark.skipif(not current_platform.is_cuda() and not current_platform.is_tpu(), reason="V1 currently only supported on CUDA and TPU") -def test_lm_eval_accuracy_v1_engine(monkeypatch): +def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): """Run with the V1 Engine.""" with monkeypatch.context() as m: @@ -85,7 +85,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch): @pytest.mark.parametrize("more_args", MORE_ARGS_LIST) -def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args): +def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch, + more_args): """Run with the V0 Engine.""" with monkeypatch.context() as m: diff --git 
a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 570e643e0364d..66db7509cc474 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -5,13 +5,12 @@ from unittest.mock import Mock, patch import pytest import torch -from tests.kernels.utils import override_backend_env_variable from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform from vllm.platforms.openvino import OpenVinoPlatform from vllm.platforms.rocm import RocmPlatform -from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL +from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL @pytest.fixture(autouse=True) @@ -25,87 +24,111 @@ def clear_cache(): "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"]) @pytest.mark.parametrize("use_v1", [True, False]) @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"]) -def test_env(name: str, use_v1: bool, device: str, monkeypatch): +def test_env( + name: str, + use_v1: bool, + device: str, + monkeypatch: pytest.MonkeyPatch, +): """Test that the attention selector can be set via environment variable. Note that we do not test FlashAttn because it is the default backend. """ - monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0") - override_backend_env_variable(monkeypatch, name) + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv(STR_BACKEND_ENV_VAR, name) - if device == "cpu": - with patch("vllm.attention.selector.current_platform", CpuPlatform()): - backend = get_attn_backend(16, torch.float16, torch.float16, 16, - False) - assert backend.get_name() == "TORCH_SDPA" - elif device == "hip": - with patch("vllm.attention.selector.current_platform", RocmPlatform()): - backend = get_attn_backend(16, torch.float16, torch.float16, 16, - False) - EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" - assert backend.get_name() == EXPECTED - elif device == "openvino": - with patch("vllm.attention.selector.current_platform", - OpenVinoPlatform()), patch.dict('sys.modules', - {'openvino': Mock()}): - backend = get_attn_backend(16, torch.float16, torch.float16, 16, - False) - assert backend.get_name() == "OPENVINO" - else: - if name in ["XFORMERS", "FLASHINFER"]: + if device == "cpu": with patch("vllm.attention.selector.current_platform", - CudaPlatform()): + CpuPlatform()): backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) - EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name + assert backend.get_name() == "TORCH_SDPA" + elif device == "hip": + with patch("vllm.attention.selector.current_platform", + RocmPlatform()): + backend = get_attn_backend(16, torch.float16, torch.float16, + 16, False) + EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" assert backend.get_name() == EXPECTED + elif device == "openvino": + with patch("vllm.attention.selector.current_platform", + OpenVinoPlatform()), patch.dict('sys.modules', + {'openvino': Mock()}): + backend = get_attn_backend(16, torch.float16, torch.float16, + 16, False) + assert backend.get_name() == "OPENVINO" + else: + if name in ["XFORMERS", "FLASHINFER"]: + with patch("vllm.attention.selector.current_platform", + CudaPlatform()): + backend = get_attn_backend(16, torch.float16, + torch.float16, 16, False) + EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name + assert backend.get_name() == EXPECTED -def 
test_flash_attn(monkeypatch): +def test_flash_attn(monkeypatch: pytest.MonkeyPatch): """Test FlashAttn validation.""" # TODO: When testing for v1, pipe in `use_v1` as an argument to # get_attn_backend - override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL) - # Unsupported CUDA arch - with patch("torch.cuda.get_device_capability", return_value=(7, 5)): + # Unsupported CUDA arch + monkeypatch.setattr(torch.cuda, "get_device_capability", lambda: + (7, 5)) backend = get_attn_backend(16, torch.float16, None, 16, False) assert backend.get_name() != STR_FLASH_ATTN_VAL - # Unsupported data type - backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Reset the monkeypatch for subsequent tests + monkeypatch.undo() - # Unsupported kv cache data type - backend = get_attn_backend(16, torch.float16, "fp8", 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Unsupported data type + backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL - # Unsupported block size - backend = get_attn_backend(16, torch.float16, None, 8, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Unsupported kv cache data type + backend = get_attn_backend(16, torch.float16, "fp8", 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL - # flash-attn is not installed - with patch.dict('sys.modules', {'vllm_flash_attn': None}): + # Unsupported block size + backend = get_attn_backend(16, torch.float16, None, 8, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # flash-attn is not installed + import sys + original_module = sys.modules.get('vllm_flash_attn') + monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None) backend = get_attn_backend(16, torch.float16, None, 16, False) assert backend.get_name() != STR_FLASH_ATTN_VAL - # Unsupported head size - backend = get_attn_backend(17, torch.float16, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Restore the original module if it existed + if original_module is not None: + monkeypatch.setitem(sys.modules, 'vllm_flash_attn', + original_module) + else: + monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False) - # Attention-free models should bypass env and use PlaceholderAttention - backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Unsupported head size + backend = get_attn_backend(17, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Attention-free models should bypass env and use PlaceholderAttention + backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) + assert backend.get_name() != STR_FLASH_ATTN_VAL @pytest.mark.parametrize("use_v1", [True, False]) -def test_invalid_env(use_v1: bool, monkeypatch): - """Ignore the invalid env variable if it is set.""" - monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0") - override_backend_env_variable(monkeypatch, STR_INVALID_VAL) +def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch): - with patch("vllm.attention.selector.current_platform", CudaPlatform()): + with monkeypatch.context() as m, patch( + "vllm.attention.selector.current_platform", CudaPlatform()): + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) + + # Test with head size 32 backend = 
get_attn_backend(32, torch.float16, None, 16, False) EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN" assert backend.get_name() == EXPECTED diff --git a/tests/kernels/test_awq.py b/tests/kernels/test_awq.py index 37ce00c74030a..248b294e546b3 100644 --- a/tests/kernels/test_awq.py +++ b/tests/kernels/test_awq.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import os - import pytest import torch @@ -11,36 +9,38 @@ from vllm import _custom_ops as ops # noqa: F401 @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"), reason="AWQ is not supported on this GPU type.") -def test_awq_dequantize_opcheck(): - os.environ["VLLM_USE_TRITON_AWQ"] = "0" - qweight = torch.randint(-2000000000, - 2000000000, (8192, 256), - device='cuda', - dtype=torch.int32) - scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16) - zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32) - split_k_iters = 0 - thx = 0 - thy = 0 - opcheck(torch.ops._C.awq_dequantize, - (qweight, scales, zeros, split_k_iters, thx, thy)) +def test_awq_dequantize_opcheck(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_TRITON_AWQ", "0") + qweight = torch.randint(-2000000000, + 2000000000, (8192, 256), + device='cuda', + dtype=torch.int32) + scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16) + zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32) + split_k_iters = 0 + thx = 0 + thy = 0 + opcheck(torch.ops._C.awq_dequantize, + (qweight, scales, zeros, split_k_iters, thx, thy)) @pytest.mark.skip(reason="Not working; needs investigation.") @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"), reason="AWQ is not supported on this GPU type.") -def test_awq_gemm_opcheck(): - os.environ["VLLM_USE_TRITON_AWQ"] = "0" - input = torch.rand((2, 8192), device='cuda', dtype=torch.float16) - qweight = torch.randint(-2000000000, - 2000000000, (8192, 256), - device='cuda', - dtype=torch.int32) - scales = torch.randint(-2000000000, - 2000000000, (64, 256), - device='cuda', - dtype=torch.int32) - qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16) - split_k_iters = 8 - opcheck(torch.ops._C.awq_gemm, - (input, qweight, qzeros, scales, split_k_iters)) +def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_TRITON_AWQ", "0") + input = torch.rand((2, 8192), device='cuda', dtype=torch.float16) + qweight = torch.randint(-2000000000, + 2000000000, (8192, 256), + device='cuda', + dtype=torch.int32) + scales = torch.randint(-2000000000, + 2000000000, (64, 256), + device='cuda', + dtype=torch.int32) + qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16) + split_k_iters = 8 + opcheck(torch.ops._C.awq_gemm, + (input, qweight, qzeros, scales, split_k_iters)) diff --git a/tests/kernels/test_rocm_attention_selector.py b/tests/kernels/test_rocm_attention_selector.py index 7cd6082486605..724f0af283f70 100644 --- a/tests/kernels/test_rocm_attention_selector.py +++ b/tests/kernels/test_rocm_attention_selector.py @@ -1,13 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 -from unittest.mock import patch - import pytest import torch -from tests.kernels.utils import override_backend_env_variable from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.platforms.rocm import RocmPlatform +from vllm.utils import STR_BACKEND_ENV_VAR @pytest.fixture(autouse=True) @@ -17,15 +15,19 @@ def clear_cache(): _cached_get_attn_backend.cache_clear() -def 
test_selector(monkeypatch): - """Test that the attention selector for ROCm. - """ - override_backend_env_variable(monkeypatch, "ROCM_FLASH") +def test_selector(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH") - with patch("vllm.attention.selector.current_platform", RocmPlatform()): + # Set the current platform to ROCm using monkeypatch + monkeypatch.setattr("vllm.attention.selector.current_platform", + RocmPlatform()) + + # Test standard ROCm attention backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) assert (backend.get_name() == "ROCM_FLASH" or backend.get_name() == "ROCM_ATTN_VLLM_V1") + # mla test for deepseek related backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, False, True) diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 010974076ba8f..22b3d7c2be7a5 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -36,12 +36,12 @@ ALL_OPCHECK_TEST_UTILS: tuple[str, ...] = ( class QKVInputs(NamedTuple): ''' - Data structure for representing unpacked attention inputs, + Data structure for representing unpacked attention inputs, query/key/values and their sequence lengths. Attributes: - * {query,key,value}: unpacked (batch_size x padded_seq_len x + * {query,key,value}: unpacked (batch_size x padded_seq_len x num_heads x head_size) attention inputs * q_seq_lens: query sequence lengths list * kv_seq_lens: shared key/value sequence lengths list @@ -56,14 +56,14 @@ class QKVInputs(NamedTuple): class QKVO(NamedTuple): ''' - Data structure for representing unpacked attention inputs, + Data structure for representing unpacked attention inputs, alongside unpacked known-correct attention output Attributes: - * qkv: unpacked (batch_size x padded_seq_len x + * qkv: unpacked (batch_size x padded_seq_len x num_heads x head_size) attention inputs - * ideal_output: unpacked (batch_size x padded_seq_len x + * ideal_output: unpacked (batch_size x padded_seq_len x num_heads x head_size) known-correct attention output ''' @@ -77,7 +77,7 @@ class PackedQKVInputs(NamedTuple): Attributes: - * {query,key,value}: packed (number_of_tokens x num_heads + * {query,key,value}: packed (number_of_tokens x num_heads x head_size) attention inputs * q_start_loc_list: list of query start locations within packed tensor * kv_start_loc_list: shared list of key/value start locations within @@ -97,14 +97,14 @@ class PackedQKVInputs(NamedTuple): class PackedQKVO(NamedTuple): ''' - Data structure for representing packed attention inputs, + Data structure for representing packed attention inputs, alongside packed known-correct attention output Attributes: - * packed_qkv: packed (number_of_tokens x num_heads + * packed_qkv: packed (number_of_tokens x num_heads x head_size) attention inputs - * ideal_output: packed (number_of_tokens x num_heads + * ideal_output: packed (number_of_tokens x num_heads x head_size) known-correct attention output ''' @@ -134,7 +134,7 @@ class PhaseTestParameters(NamedTuple): Attributes: - * packed_qkvo: packed (number_of_tokens x num_heads + * packed_qkvo: packed (number_of_tokens x num_heads x head_size) attention inputs & known-correct output * kv_mmap: KV cache memory mapping, specific to this test phase & @@ -195,7 +195,7 @@ def make_causal_mask( Create a q_max_seq_len x kv_max_seq_len causal mask Arguments: - + * q_max_seq_len: query max seq len * kv_max_seq_len: key/value max seq len @@ -320,9 +320,9 @@ def make_qkv( * max_kv_seq_len: max key/value seq len * num_heads 
* head_size - * is_encoder_decoder_attn: if True, query seqlen may differ from - key/value seqlen (as is often the case for cross-attention); - o/w, query/key/value seqlens match at each batch index + * is_encoder_decoder_attn: if True, query seqlen may differ from + key/value seqlen (as is often the case for cross-attention); + o/w, query/key/value seqlens match at each batch index (max_kv_seq_len is unused) * force_kv_seq_lens: if not None, overrides kv sequence lengths * attn_type: encoder, decoder self, or enc/dec cross attention @@ -469,7 +469,7 @@ def pack_qkv(qkv: QKVInputs, device: Union[torch.device, Individually pack each of Q, K and V, each with dimensions batch_size x padded_seq_len x num_heads x head_size, into respective number_of_tokens x num_heads x head_size tensors. - + For Q, number_of_tokens = sum(q_seq_lens). For K and V, number_of_tokens = sum(kv_seq_lens) @@ -619,9 +619,9 @@ def make_kv_cache(num_blocks: int, Returns: * kv_cache: 2 x num_blocks x (block_size * num_heads * head_size) - * for backend 'XFORMERS' + * for backend 'XFORMERS' * kv_cache: 2 x num_blocks x block_size x num_heads x head_size - * for backend 'FLASH_ATTN' + * for backend 'FLASH_ATTN' ''' if backend == 'XFORMERS': kv_cache = torch.rand( @@ -662,20 +662,20 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int], Context: * Your goal is to test (1) prefill of N prompts, with prompt-lengths {K_i \\forall i \\in [0,N)}, followed by (2) decoding of a single token - for all N prompts (N tokens total); the resultant sequence lengths + for all N prompts (N tokens total); the resultant sequence lengths after decode would be {K_i + 1 for i \\in [0,N)} - * The test you want to do requires (1) having the prefill slot mapping - for all tokens present during prefill, the number of which is - M = \\sum_i{K_i}, and (2) having the decode slot mapping for all N + * The test you want to do requires (1) having the prefill slot mapping + for all tokens present during prefill, the number of which is + M = \\sum_i{K_i}, and (2) having the decode slot mapping for all N decoded tokens - - This function consumes a single 1D slot mapping, which is the + + This function consumes a single 1D slot mapping, which is the concatenation of N slot mappings each of length K_i + 1 (corresponding to the sequence lengths after decode), with a total length of P = \\sum_i{K_i + 1} = M + N The prefill-phase slot mapping results from excising the (K_i + 1)-th entry - from each of the N subsequences in the slot mapping (i.e. omitting the + from each of the N subsequences in the slot mapping (i.e. omitting the decoded token's mapping.) The N excised entries are appended to obtain the decode-phase slot mapping @@ -684,15 +684,15 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int], * slot_mapping_list: Length-P 1D slot mapping (as list) reflecting all N post-decode sequences - * seq_lens: list of N post-decode sequence lengths (K_i + 1 in the + * seq_lens: list of N post-decode sequence lengths (K_i + 1 in the description above) * device: cuda, cpu, etc. 
Returns: - * prefill_slot_mapping: Length-M 1D slot mapping (as Tensor) + * prefill_slot_mapping: Length-M 1D slot mapping (as Tensor) reflecting all N prefill prompts - * decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting + * decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting all N decoded tokens ''' @@ -725,7 +725,7 @@ def make_block_tables_slot_mapping( Then the minimum KV cache size in blocks is - total_cache_blocks = sum(num_blocks for all seqs) + total_cache_blocks = sum(num_blocks for all seqs) Then, the blocktable mapping counts downward from @@ -734,7 +734,7 @@ def make_block_tables_slot_mapping( to block_base_addr - + The constructed block-tables and slot-mapping are sized to the lengths of the sequences in their entirety (as reflected by seq_lens), @@ -749,7 +749,7 @@ def make_block_tables_slot_mapping( Return: - * block_tables_tensor: block table for sequence + * block_tables_tensor: block table for sequence * slot_mapping_list: slot mapping for sequence * max_block_idx: the highest block address within this block table ''' @@ -807,7 +807,7 @@ def make_test_metadata( encoder_test_params and cross_test_params arguments allow encoder attention and enc/dec cross-attention (respectively) to use distinct metadata values from decoder self-attention (decoder_test_params.) - + if encoder_test_params and cross_test_params are None, the attention metadata will support decoder-only scenario. @@ -820,7 +820,7 @@ def make_test_metadata( * attn_backend_name: Backend for sourcing attention kernels * is_prompt: prefill if True, o/w decode * seq_lens: list of token counts for each sequence - * decoder_test_params: decoder self-attention test params; + * decoder_test_params: decoder self-attention test params; this function requires kv_mmap (memory mapping) field * device: CPU or CUDA device diff --git a/tests/kv_transfer/disagg_test.py b/tests/kv_transfer/test_disagg.py similarity index 100% rename from tests/kv_transfer/disagg_test.py rename to tests/kv_transfer/test_disagg.py diff --git a/tests/kv_transfer/module_test.py b/tests/kv_transfer/test_module.py similarity index 100% rename from tests/kv_transfer/module_test.py rename to tests/kv_transfer/test_module.py diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py index faca7a566e79c..51abcb7172cb7 100644 --- a/tests/models/decoder_only/language/test_fp8.py +++ b/tests/models/decoder_only/language/test_fp8.py @@ -12,11 +12,10 @@ import pytest from tests.kernels.utils import override_backend_env_variable from tests.quantization.utils import is_quant_method_supported from vllm.platforms import current_platform +from vllm.utils import STR_BACKEND_ENV_VAR from ...utils import check_logprobs_close -os.environ["TOKENIZERS_PARALLELISM"] = "true" - @pytest.mark.quant_model @pytest.mark.skipif(not is_quant_method_supported("fp8"), @@ -55,45 +54,47 @@ def test_models( backend: str, tensor_parallel_size: int, disable_async_output_proc: bool, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """ Only checks log probs match to cover the discrepancy in numerical sensitive kernels. 
""" - override_backend_env_variable(monkeypatch, backend) + with monkeypatch.context() as m: + m.setenv("TOKENIZERS_PARALLELISM", 'true') + m.setenv(STR_BACKEND_ENV_VAR, backend) - MAX_MODEL_LEN = 1024 - NUM_LOG_PROBS = 8 + MAX_MODEL_LEN = 1024 + NUM_LOG_PROBS = 8 - with vllm_runner( - base_model, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - kv_cache_dtype="auto", - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - baseline_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) + with vllm_runner( + base_model, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + kv_cache_dtype="auto", + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + baseline_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - with vllm_runner( - test_model, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - test_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) + with vllm_runner( + test_model, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + test_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - check_logprobs_close( - outputs_0_lst=baseline_outputs, - outputs_1_lst=test_outputs, - name_0="fp16_kv_cache", - name_1="fp8_kv_cache", - ) + check_logprobs_close( + outputs_0_lst=baseline_outputs, + outputs_1_lst=test_outputs, + name_0="fp16_kv_cache", + name_1="fp8_kv_cache", + ) @pytest.mark.cpu_model @@ -119,38 +120,41 @@ def test_cpu_models( test_model: str, max_tokens: int, disable_async_output_proc: bool, + monkeypatch: pytest.MonkeyPatch, ) -> None: """ Only checks log probs match to cover the discrepancy in numerical sensitive kernels. 
""" + with monkeypatch.context() as m: + m.setenv("TOKENIZERS_PARALLELISM", 'true') - MAX_MODEL_LEN = 1024 - NUM_LOG_PROBS = 8 + MAX_MODEL_LEN = 1024 + NUM_LOG_PROBS = 8 - with vllm_runner( - base_model, - max_model_len=MAX_MODEL_LEN, - dtype="bfloat16", - kv_cache_dtype="auto", - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - baseline_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) + with vllm_runner( + base_model, + max_model_len=MAX_MODEL_LEN, + dtype="bfloat16", + kv_cache_dtype="auto", + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + baseline_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - with vllm_runner( - test_model, - max_model_len=MAX_MODEL_LEN, - dtype="bfloat16", - kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - test_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) + with vllm_runner( + test_model, + max_model_len=MAX_MODEL_LEN, + dtype="bfloat16", + kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + test_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - check_logprobs_close( - outputs_0_lst=baseline_outputs, - outputs_1_lst=test_outputs, - name_0="bf16_kv_cache", - name_1="fp8_kv_cache", - ) + check_logprobs_close( + outputs_0_lst=baseline_outputs, + outputs_1_lst=test_outputs, + name_0="bf16_kv_cache", + name_1="fp8_kv_cache", + ) diff --git a/tests/models/embedding/language/test_gritlm.py b/tests/models/embedding/language/test_gritlm.py index cae3e1a5c6244..d6bf7d2706397 100644 --- a/tests/models/embedding/language/test_gritlm.py +++ b/tests/models/embedding/language/test_gritlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations import importlib.util import math @@ -11,6 +12,7 @@ from scipy.spatial.distance import cosine import vllm import vllm.config +from vllm.utils import STR_BACKEND_ENV_VAR from ....utils import RemoteOpenAIServer @@ -29,36 +31,34 @@ def _arr(arr): return array("i", arr) -def test_find_array(monkeypatch): +def test_find_array(monkeypatch: pytest.MonkeyPatch): # GritLM embedding implementation is only supported by XFormers backend. - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - from vllm.model_executor.models.gritlm import GritLMPooler + from vllm.model_executor.models.gritlm import GritLMPooler - # Create an LLM object to get the model config. - llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) - pooler = GritLMPooler(model_config=llm.llm_engine.model_config) + # Create an LLM object to get the model config. 
+ llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) + pooler = GritLMPooler(model_config=llm.llm_engine.model_config) - arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 - assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 + assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1 - with pytest.raises(ValueError): - pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) + with pytest.raises(ValueError): + pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) @pytest.fixture(scope="module") def server_embedding(): # GritLM embedding implementation is only supported by XFormers backend. - with pytest.MonkeyPatch.context() as mp: - mp.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") - - args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)] - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server + args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)] + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server @pytest.fixture(scope="module") @@ -69,9 +69,12 @@ def server_generate(): @pytest_asyncio.fixture -async def client_embedding(server_embedding: RemoteOpenAIServer): - async with server_embedding.get_async_client() as async_client: - yield async_client +async def client_embedding(monkeypatch: pytest.MonkeyPatch, + server_embedding: RemoteOpenAIServer): + with monkeypatch.context() as m: + m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") + async with server_embedding.get_async_client() as async_client: + yield async_client @pytest_asyncio.fixture @@ -80,14 +83,20 @@ async def client_generate(server_generate: RemoteOpenAIServer): yield async_client -def run_llm_encode(llm: vllm.LLM, queries: list[str], - instruction: str) -> list[float]: +def run_llm_encode( + llm: vllm.LLM, + queries: list[str], + instruction: str, +) -> list[float]: outputs = llm.encode([instruction + q for q in queries], ) return [output.outputs.embedding for output in outputs] -async def run_client_embeddings(client: vllm.LLM, queries: list[str], - instruction: str) -> list[float]: +async def run_client_embeddings( + client: vllm.LLM, + queries: list[str], + instruction: str, +) -> list[float]: outputs = await client.embeddings.create( model=MODEL_NAME, input=[instruction + q for q in queries], @@ -106,7 +115,7 @@ def get_test_data(): README.md in https://github.com/ContextualAI/gritlm """ q_instruction = gritlm_instruction( - "Given a scientific paper title, retrieve the paper's abstract") + "Given a scientific paper title, retrieve the paper's abstract", ) queries = [ "Bitcoin: A Peer-to-Peer Electronic Cash System", "Generative Representational Instruction Tuning", @@ -136,31 +145,32 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]): assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001) -def test_gritlm_offline_embedding(monkeypatch): +def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch): # GritLM embedding implementation is only supported by XFormers backend. 
- monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - queries, q_instruction, documents, d_instruction = get_test_data() + queries, q_instruction, documents, d_instruction = get_test_data() - llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) + llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) - d_rep = run_llm_encode( - llm, - documents, - d_instruction, - ) - q_rep = run_llm_encode( - llm, - queries, - q_instruction, - ) + d_rep = run_llm_encode( + llm, + documents, + d_instruction, + ) + q_rep = run_llm_encode( + llm, + queries, + q_instruction, + ) - validate_embed_output(q_rep, d_rep) + validate_embed_output(q_rep, d_rep) @pytest.mark.asyncio async def test_gritlm_api_server_embedding( - client_embedding: openai.AsyncOpenAI): + client_embedding: openai.AsyncOpenAI, ): queries, q_instruction, documents, d_instruction = get_test_data() d_rep = await run_client_embeddings( diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index d3d07d0d9acfc..465c496f4c0f3 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import os - import pytest from vllm import LLM, SamplingParams @@ -11,76 +9,92 @@ from ..utils import fork_new_process_for_each_test @fork_new_process_for_each_test -def test_plugin(dummy_opt_path, monkeypatch): +def test_plugin( + monkeypatch: pytest.MonkeyPatch, + dummy_opt_path: str, +): # V1 shuts down rather than raising an error here. - monkeypatch.setenv("VLLM_USE_V1", "0") - os.environ["VLLM_PLUGINS"] = "" - with pytest.raises(Exception) as excinfo: - LLM(model=dummy_opt_path, load_format="dummy") - error_msg = "has no vLLM implementation and " \ - "the Transformers implementation is not compatible with vLLM" - assert (error_msg in str(excinfo.value)) + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + m.setenv("VLLM_PLUGINS", "") + + with pytest.raises(Exception) as excinfo: + LLM(model=dummy_opt_path, load_format="dummy") + error_msg = "has no vLLM implementation and the Transformers implementation is not compatible with vLLM" # noqa: E501 + assert (error_msg in str(excinfo.value)) @fork_new_process_for_each_test -def test_oot_registration_text_generation(dummy_opt_path): - os.environ["VLLM_PLUGINS"] = "register_dummy_model" - prompts = ["Hello, my name is", "The text does not matter"] - sampling_params = SamplingParams(temperature=0) - llm = LLM(model=dummy_opt_path, load_format="dummy") - first_token = llm.get_tokenizer().decode(0) - outputs = llm.generate(prompts, sampling_params) +def test_oot_registration_text_generation( + monkeypatch: pytest.MonkeyPatch, + dummy_opt_path: str, +): + with monkeypatch.context() as m: + m.setenv("VLLM_PLUGINS", "register_dummy_model") + prompts = ["Hello, my name is", "The text does not matter"] + sampling_params = SamplingParams(temperature=0) + llm = LLM(model=dummy_opt_path, load_format="dummy") + first_token = llm.get_tokenizer().decode(0) + outputs = llm.generate(prompts, sampling_params) - for output in outputs: - generated_text = output.outputs[0].text - # make sure only the first token is generated - rest = generated_text.replace(first_token, "") - assert rest == "" + for output in outputs: + generated_text = output.outputs[0].text + # make sure only the first token is generated + rest = generated_text.replace(first_token, "") + assert rest == "" 
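# --- Editor's illustrative sketch (not part of the patch) ---------------------
# The diffs in this series replace direct os.environ mutation with pytest's
# monkeypatch.context() so environment changes are scoped to a block and are
# restored automatically, even when the test body raises. The snippet below is
# a minimal, self-contained sketch of that pattern; read_backend() and the
# test name are hypothetical stand-ins, while VLLM_ATTENTION_BACKEND is one of
# the variables the patched tests actually set.
import os

import pytest


def read_backend() -> str:
    # Stand-in for code under test that consults an environment variable.
    return os.environ.get("VLLM_ATTENTION_BACKEND", "DEFAULT")


def test_backend_env_is_scoped(monkeypatch: pytest.MonkeyPatch):
    before = os.environ.get("VLLM_ATTENTION_BACKEND")
    with monkeypatch.context() as m:
        m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
        # Inside the block the override is visible to the code under test.
        assert read_backend() == "FLASH_ATTN"
    # On exit the context undoes the override, restoring the prior value
    # (or the variable's absence) without any manual cleanup.
    assert os.environ.get("VLLM_ATTENTION_BACKEND") == before
# -------------------------------------------------------------------------------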
@fork_new_process_for_each_test -def test_oot_registration_embedding(dummy_gemma2_embedding_path): - os.environ["VLLM_PLUGINS"] = "register_dummy_model" - prompts = ["Hello, my name is", "The text does not matter"] - llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy") - outputs = llm.embed(prompts) +def test_oot_registration_embedding( + monkeypatch: pytest.MonkeyPatch, + dummy_gemma2_embedding_path: str, +): + with monkeypatch.context() as m: + m.setenv("VLLM_PLUGINS", "register_dummy_model") + prompts = ["Hello, my name is", "The text does not matter"] + llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy") + outputs = llm.embed(prompts) - for output in outputs: - assert all(v == 0 for v in output.outputs.embedding) + for output in outputs: + assert all(v == 0 for v in output.outputs.embedding) image = ImageAsset("cherry_blossom").pil_image.convert("RGB") @fork_new_process_for_each_test -def test_oot_registration_multimodal(dummy_llava_path, monkeypatch): - os.environ["VLLM_PLUGINS"] = "register_dummy_model" - prompts = [{ - "prompt": "What's in the image?", - "multi_modal_data": { - "image": image - }, - }, { - "prompt": "Describe the image", - "multi_modal_data": { - "image": image - }, - }] +def test_oot_registration_multimodal( + monkeypatch: pytest.MonkeyPatch, + dummy_llava_path: str, +): + with monkeypatch.context() as m: + m.setenv("VLLM_PLUGINS", "register_dummy_model") + prompts = [{ + "prompt": "What's in the image?", + "multi_modal_data": { + "image": image + }, + }, { + "prompt": "Describe the image", + "multi_modal_data": { + "image": image + }, + }] - sampling_params = SamplingParams(temperature=0) - llm = LLM(model=dummy_llava_path, - load_format="dummy", - max_num_seqs=1, - trust_remote_code=True, - gpu_memory_utilization=0.98, - max_model_len=4096, - enforce_eager=True, - limit_mm_per_prompt={"image": 1}) - first_token = llm.get_tokenizer().decode(0) - outputs = llm.generate(prompts, sampling_params) + sampling_params = SamplingParams(temperature=0) + llm = LLM(model=dummy_llava_path, + load_format="dummy", + max_num_seqs=1, + trust_remote_code=True, + gpu_memory_utilization=0.98, + max_model_len=4096, + enforce_eager=True, + limit_mm_per_prompt={"image": 1}) + first_token = llm.get_tokenizer().decode(0) + outputs = llm.generate(prompts, sampling_params) - for output in outputs: - generated_text = output.outputs[0].text - # make sure only the first token is generated - rest = generated_text.replace(first_token, "") - assert rest == "" + for output in outputs: + generated_text = output.outputs[0].text + # make sure only the first token is generated + rest = generated_text.replace(first_token, "") + assert rest == "" diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py index aad7fc5303c13..e617bd057f1f4 100644 --- a/tests/mq_llm_engine/test_error_handling.py +++ b/tests/mq_llm_engine/test_error_handling.py @@ -235,25 +235,28 @@ async def test_bad_request(tmp_socket): @pytest.mark.asyncio -async def test_mp_crash_detection(monkeypatch): +async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: - parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.") - parser = make_arg_parser(parser) - args = parser.parse_args([]) + parser = FlexibleArgumentParser( + description="vLLM's remote OpenAI server.") + parser = make_arg_parser(parser) + args = parser.parse_args([]) - # When LLMEngine is loaded, it will crash. 
- def mock_init(): - raise ValueError + # When LLMEngine is loaded, it will crash. + def mock_init(): + raise ValueError - monkeypatch.setattr(LLMEngine, "__init__", mock_init) + m.setattr(LLMEngine, "__init__", mock_init) - start = time.perf_counter() - async with build_async_engine_client(args): - pass - end = time.perf_counter() + start = time.perf_counter() + async with build_async_engine_client(args): + pass + end = time.perf_counter() - assert end - start < 60, ("Expected vLLM to gracefully shutdown in <60s " - "if there is an error in the startup.") + assert end - start < 60, ( + "Expected vLLM to gracefully shutdown in <60s " + "if there is an error in the startup.") @pytest.mark.asyncio diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index f925e42f46d37..ce716e6474cb4 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -5,7 +5,7 @@ from typing import Optional import pytest -from tests.kernels.utils import override_backend_env_variable +from vllm.utils import STR_BACKEND_ENV_VAR from ..models.utils import check_logprobs_close from ..utils import (completions_with_server_args, get_client_text_generations, @@ -52,7 +52,7 @@ async def test_multi_step( num_logprobs: Optional[int], attention_backend: str, enable_chunked_prefill: bool, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """Test vLLM engine with multi-step scheduling in an OpenAI-protocol client/server environment. @@ -82,67 +82,70 @@ async def test_multi_step( pytest.skip("Multi-step with Chunked-Prefill only supports" "PP=1 and FLASH_ATTN backend") - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts + prompts = example_prompts + if len(prompts) < num_prompts: + prompts = prompts * ((num_prompts // len(prompts)) + 1) + prompts = prompts[:num_prompts] + assert len(prompts) == num_prompts - server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"] - ms_server_args = DEFAULT_SERVER_ARGS + \ - ["--num-scheduler-steps", f"{num_scheduler_steps}"] + server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"] + ms_server_args = DEFAULT_SERVER_ARGS + \ + ["--num-scheduler-steps", f"{num_scheduler_steps}"] - if not is_async: - ms_server_args += ["--disable-async-output-proc"] + if not is_async: + ms_server_args += ["--disable-async-output-proc"] - if eager_mode: - ms_server_args.append("--enforce-eager") + if eager_mode: + ms_server_args.append("--enforce-eager") - if enable_chunked_prefill: - ms_server_args.append("--enable-chunked-prefill") + if enable_chunked_prefill: + ms_server_args.append("--enable-chunked-prefill") - distributed_args = [ - "--tensor-parallel-size", - str(tp_size), - "--pipeline-parallel-size", - str(pp_size), - ] + distributed_args = [ + "--tensor-parallel-size", + str(tp_size), + "--pipeline-parallel-size", + str(pp_size), + ] - # Spin up client/server & issue completion API requests. 
- # Default `max_wait_seconds` is 240 but was empirically - # was raised 5x to 1200 *just for this test* due to - # observed timeouts in GHA CI - ref_completions = await completions_with_server_args( - prompts, - model, - server_args + distributed_args, - num_logprobs, - max_wait_seconds=5 * 240) - test_completions = await completions_with_server_args( - prompts, - model, - ms_server_args + distributed_args, - num_logprobs, - max_wait_seconds=5 * 240) + # Spin up client/server & issue completion API requests. + # Default `max_wait_seconds` is 240 but was empirically + # was raised 5x to 1200 *just for this test* due to + # observed timeouts in GHA CI + ref_completions = await completions_with_server_args( + prompts, + model, + server_args + distributed_args, + num_logprobs, + max_wait_seconds=5 * 240) + test_completions = await completions_with_server_args( + prompts, + model, + ms_server_args + distributed_args, + num_logprobs, + max_wait_seconds=5 * 240) - # Assert multi-step scheduling produces identical tokens - # to single-step scheduling. - ref_generations = get_client_text_generations(ref_completions) - test_generations = get_client_text_generations(test_completions) - assert ref_generations == test_generations + # Assert multi-step scheduling produces identical tokens + # to single-step scheduling. + ref_generations = get_client_text_generations(ref_completions) + test_generations = get_client_text_generations(test_completions) + assert ref_generations == test_generations - # Assert multi-step scheduling produces nearly-identical logprobs - # to single-step scheduling. - ref_text_logprobs = get_client_text_logprob_generations(ref_completions) - test_text_logprobs = get_client_text_logprob_generations(test_completions) - check_logprobs_close( - outputs_0_lst=ref_text_logprobs, - outputs_1_lst=test_text_logprobs, - name_0="hf", - name_1="vllm", - ) + # Assert multi-step scheduling produces nearly-identical logprobs + # to single-step scheduling. + ref_text_logprobs = get_client_text_logprob_generations( + ref_completions) + test_text_logprobs = get_client_text_logprob_generations( + test_completions) + check_logprobs_close( + outputs_0_lst=ref_text_logprobs, + outputs_1_lst=test_text_logprobs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize(("tp_size, pp_size"), [ @@ -152,7 +155,7 @@ async def test_multi_step( async def test_multi_step_pp_smoke( tp_size: int, pp_size: int, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """ Smoke test for the vLLM engine with multi-step scheduling in an @@ -174,54 +177,55 @@ async def test_multi_step_pp_smoke( attention_backend = "FLASH_ATTN" max_num_seqs = 3 - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - # Prompt from the ShareGPT dataset - prompts = [ - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - ] - # Use varying max_tokens to introduce scheduling randomness. 
- max_tokens = [10 * i for i in range(1, len(prompts) + 1)] - assert len(prompts) == len(max_tokens) + # Prompt from the ShareGPT dataset + prompts = [ + "in the jtbd context whats a push?", # codespell:ignore + "in the jtbd context whats a push?", # codespell:ignore + "in the jtbd context whats a push?", # codespell:ignore + "in the jtbd context whats a push?", # codespell:ignore + ] + # Use varying max_tokens to introduce scheduling randomness. + max_tokens = [10 * i for i in range(1, len(prompts) + 1)] + assert len(prompts) == len(max_tokens) - test_args = [ - "--tensor-parallel-size", - str(tp_size), "--pipeline-parallel-size", - str(pp_size), "--max-num-seqs", - str(max_num_seqs) - ] + test_args = [ + "--tensor-parallel-size", + str(tp_size), "--pipeline-parallel-size", + str(pp_size), "--max-num-seqs", + str(max_num_seqs) + ] - server_args = DEFAULT_SERVER_ARGS + test_args - ms_server_args = DEFAULT_SERVER_ARGS + \ - ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \ - test_args + server_args = DEFAULT_SERVER_ARGS + test_args + ms_server_args = DEFAULT_SERVER_ARGS + \ + ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \ + test_args - # Spin up client/server & issue completion API requests. - # Default `max_wait_seconds` is 240 but was empirically - # was raised 3x to 720 *just for this test* due to - # observed timeouts in GHA CI - ref_completions = await completions_with_server_args( - prompts=prompts, - model_name=model, - server_cli_args=server_args, - num_logprobs=None, - max_wait_seconds=5 * 240, - max_tokens=max_tokens) + # Spin up client/server & issue completion API requests. + # Default `max_wait_seconds` is 240 but was empirically + # was raised 3x to 720 *just for this test* due to + # observed timeouts in GHA CI + ref_completions = await completions_with_server_args( + prompts=prompts, + model_name=model, + server_cli_args=server_args, + num_logprobs=None, + max_wait_seconds=5 * 240, + max_tokens=max_tokens) - test_completions = await completions_with_server_args( - prompts=prompts, - model_name=model, - server_cli_args=ms_server_args, - num_logprobs=None, - max_wait_seconds=5 * 240, - max_tokens=max_tokens) + test_completions = await completions_with_server_args( + prompts=prompts, + model_name=model, + server_cli_args=ms_server_args, + num_logprobs=None, + max_wait_seconds=5 * 240, + max_tokens=max_tokens) - # Assert multi-step scheduling produces identical tokens - # to single-step scheduling. - ref_generations = get_client_text_generations(ref_completions) - test_generations = get_client_text_generations(test_completions) + # Assert multi-step scheduling produces identical tokens + # to single-step scheduling. 
+ ref_generations = get_client_text_generations(ref_completions) + test_generations = get_client_text_generations(test_completions) - assert ref_generations == test_generations + assert ref_generations == test_generations diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py index 29d5ffd4c9cb1..a823e484beab6 100644 --- a/tests/multi_step/test_correctness_llm.py +++ b/tests/multi_step/test_correctness_llm.py @@ -7,7 +7,7 @@ from typing import Optional import pytest -from tests.kernels.utils import override_backend_env_variable +from vllm.utils import STR_BACKEND_ENV_VAR from ..models.utils import check_logprobs_close, check_outputs_equal @@ -42,7 +42,7 @@ def test_multi_step_llm( num_prompts: int, num_logprobs: Optional[int], attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """Test vLLM engine with multi-step scheduling via sync LLM Engine. @@ -70,48 +70,49 @@ def test_multi_step_llm( num_logprobs: corresponds to the `logprobs` argument to the OpenAI completions endpoint; `None` -> 1 logprob returned. """ - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts + prompts = example_prompts + if len(prompts) < num_prompts: + prompts = prompts * ((num_prompts // len(prompts)) + 1) + prompts = prompts[:num_prompts] + assert len(prompts) == num_prompts - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - enable_chunked_prefill=enable_chunked_prefill, - num_scheduler_steps=num_scheduler_steps, - ) as vllm_model: - vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens) - if num_logprobs is None else - vllm_model.generate_greedy_logprobs( - prompts, max_tokens, num_logprobs)) + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + enable_chunked_prefill=enable_chunked_prefill, + num_scheduler_steps=num_scheduler_steps, + ) as vllm_model: + vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens) + if num_logprobs is None else + vllm_model.generate_greedy_logprobs( + prompts, max_tokens, num_logprobs)) - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = (hf_model.generate_greedy(prompts, max_tokens) - if num_logprobs is None else - hf_model.generate_greedy_logprobs_limit( - prompts, max_tokens, num_logprobs)) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = (hf_model.generate_greedy(prompts, max_tokens) + if num_logprobs is None else + hf_model.generate_greedy_logprobs_limit( + prompts, max_tokens, num_logprobs)) - if num_logprobs is None: - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - else: - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + if num_logprobs is None: + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + else: + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("model", MODELS) @@ -136,7 +137,7 @@ def 
test_multi_step_llm_w_prompt_logprobs( num_logprobs: Optional[int], num_prompt_logprobs: Optional[int], attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """Test prompt logprobs with multi-step scheduling via sync LLM Engine. @@ -166,47 +167,48 @@ def test_multi_step_llm_w_prompt_logprobs( note that this argument is not supported by the OpenAI completions endpoint. """ - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts + prompts = example_prompts + if len(prompts) < num_prompts: + prompts = prompts * ((num_prompts // len(prompts)) + 1) + prompts = prompts[:num_prompts] + assert len(prompts) == num_prompts - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - num_scheduler_steps=num_scheduler_steps, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs, - num_prompt_logprobs=num_prompt_logprobs) + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + num_scheduler_steps=num_scheduler_steps, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs, + num_prompt_logprobs=num_prompt_logprobs) - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - ) as vllm_model: - single_step_vllm_outputs = vllm_model.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs, - num_prompt_logprobs=num_prompt_logprobs) + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + ) as vllm_model: + single_step_vllm_outputs = vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs, + num_prompt_logprobs=num_prompt_logprobs) - check_logprobs_close( - outputs_0_lst=single_step_vllm_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_logprobs_close( + outputs_0_lst=single_step_vllm_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("model", MODELS) @@ -230,7 +232,7 @@ def test_multi_step_llm_chunked_prefill_prefix_cache( num_prompts: int, num_logprobs: Optional[int], attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """Test vLLM engine with multi-step+"single-step chunked prefill"+APC. @@ -293,77 +295,78 @@ def test_multi_step_llm_chunked_prefill_prefix_cache( # # The Incorrect scheduling behavior - if it occurs - will cause an exception # in the model runner resulting from `do_sample=False`. 
- override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - assert len(example_prompts) >= 2 - challenge_prompts = copy.deepcopy(example_prompts) - challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient ' - 'inference and serving engine for LLMs.\n' - ) # 24 tok - challenge_prompts[1] = ( - 'Briefly describe the major milestones in the ' - 'development of artificial intelligence from 1950 to 2020.\n' - ) # 30 tok + assert len(example_prompts) >= 2 + challenge_prompts = copy.deepcopy(example_prompts) + challenge_prompts[0] = ( + 'vLLM is a high-throughput and memory-efficient ' + 'inference and serving engine for LLMs.\n') # 24 tok + challenge_prompts[1] = ( + 'Briefly describe the major milestones in the ' + 'development of artificial intelligence from 1950 to 2020.\n' + ) # 30 tok - # If necessary, adjust the length of `challenge_prompts` to match - # `num_prompts` - if len(challenge_prompts) < num_prompts: - challenge_prompts = (challenge_prompts * - ((num_prompts // len(challenge_prompts)) + 1)) - challenge_prompts = challenge_prompts[:num_prompts] - assert len(challenge_prompts) == num_prompts + # If necessary, adjust the length of `challenge_prompts` to match + # `num_prompts` + if len(challenge_prompts) < num_prompts: + challenge_prompts = (challenge_prompts * + ((num_prompts // len(challenge_prompts)) + 1)) + challenge_prompts = challenge_prompts[:num_prompts] + assert len(challenge_prompts) == num_prompts - # Single-step scheduler baseline - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - num_scheduler_steps=num_scheduler_steps, - max_model_len=48, - max_num_batched_tokens=48, - max_num_seqs=4, - block_size=16, - ) as vllm_model: - outputs_baseline = (vllm_model.generate_greedy( - challenge_prompts, max_tokens) if num_logprobs is None else - vllm_model.generate_greedy_logprobs( - challenge_prompts, max_tokens, num_logprobs)) + # Single-step scheduler baseline + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + num_scheduler_steps=num_scheduler_steps, + max_model_len=48, + max_num_batched_tokens=48, + max_num_seqs=4, + block_size=16, + ) as vllm_model: + outputs_baseline = ( + vllm_model.generate_greedy(challenge_prompts, max_tokens) if + num_logprobs is None else vllm_model.generate_greedy_logprobs( + challenge_prompts, max_tokens, num_logprobs)) - # multi-step+"single-step chunked prefill"+APC - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - enable_chunked_prefill=True, - enable_prefix_caching=True, - num_scheduler_steps=num_scheduler_steps, - max_model_len=48, - max_num_batched_tokens=48, - max_num_seqs=4, - block_size=16, - ) as vllm_model: - outputs_w_features = (vllm_model.generate_greedy( - challenge_prompts, max_tokens) if num_logprobs is None else - vllm_model.generate_greedy_logprobs( - challenge_prompts, max_tokens, num_logprobs)) + # multi-step+"single-step chunked prefill"+APC + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + enable_chunked_prefill=True, + enable_prefix_caching=True, + num_scheduler_steps=num_scheduler_steps, + max_model_len=48, + max_num_batched_tokens=48, + max_num_seqs=4, + block_size=16, + ) as 
vllm_model: + outputs_w_features = ( + vllm_model.generate_greedy(challenge_prompts, max_tokens) if + num_logprobs is None else vllm_model.generate_greedy_logprobs( + challenge_prompts, max_tokens, num_logprobs)) - if num_logprobs is None: - # No-logprobs test - check_outputs_equal( - outputs_0_lst=outputs_baseline, - outputs_1_lst=outputs_w_features, - name_0="multi-step", - name_1="multi-step+features", - ) - else: - # Yes-logprobs test - check_logprobs_close( - outputs_0_lst=outputs_baseline, - outputs_1_lst=outputs_w_features, - name_0="multi-step", - name_1="multi-step+features", - ) + if num_logprobs is None: + # No-logprobs test + check_outputs_equal( + outputs_0_lst=outputs_baseline, + outputs_1_lst=outputs_w_features, + name_0="multi-step", + name_1="multi-step+features", + ) + else: + # Yes-logprobs test + check_logprobs_close( + outputs_0_lst=outputs_baseline, + outputs_1_lst=outputs_w_features, + name_0="multi-step", + name_1="multi-step+features", + ) diff --git a/tests/neuron/1_core/test_block_table.py b/tests/neuron/1_core/test_block_table.py index 30dcdd573edf3..033a36b4156b0 100644 --- a/tests/neuron/1_core/test_block_table.py +++ b/tests/neuron/1_core/test_block_table.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -import os import neuronxcc.nki.language as nl import pytest @@ -99,6 +98,7 @@ def ref_block_tables_transform( ) @torch.inference_mode() def test_load_and_transform_block_tables( + monkeypatch: pytest.MonkeyPatch, num_tiles, num_blocks_per_tile, q_head_per_kv_head, @@ -108,46 +108,46 @@ def test_load_and_transform_block_tables( device = xm.xla_device() - compiler_flags = [ + compiler_flags_str = " ".join([ "-O1", "--retry_failed_compilation", - ] - compiler_flags_str = " ".join(compiler_flags) - os.environ["NEURON_CC_FLAGS"] = compiler_flags_str + ]) + with monkeypatch.context() as m: + m.setenv("NEURON_CC_FLAGS", compiler_flags_str) - torch.manual_seed(10000) - torch.set_printoptions(sci_mode=False) + torch.manual_seed(10000) + torch.set_printoptions(sci_mode=False) - # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient - B_P_SIZE = 128 - if num_blocks_per_tile < B_P_SIZE: - assert B_P_SIZE % num_blocks_per_tile == 0 - block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile - else: - block_size_tiling_factor = 1 - max_num_blocks = 100000 - block_tables = torch.randint( - 0, - max_num_blocks, - (num_tiles * num_blocks_per_tile, ), - dtype=torch.int32, - ) - nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1]( - block_tables.to(device=device), - num_tiles, - num_blocks_per_tile, - q_head_per_kv_head, - head_id, - block_size_tiling_factor, - ).cpu() - ref_out = ref_block_tables_transform( - block_tables, - num_tiles, - num_blocks_per_tile, - q_head_per_kv_head, - head_id, - block_size_tiling_factor, - ) - assert (nki_out.shape == ref_out.shape - ), f"{nki_out.shape=} != {ref_out.shape=}" - assert torch.all(nki_out == ref_out) + # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient + B_P_SIZE = 128 + if num_blocks_per_tile < B_P_SIZE: + assert B_P_SIZE % num_blocks_per_tile == 0 + block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile + else: + block_size_tiling_factor = 1 + max_num_blocks = 100000 + block_tables = torch.randint( + 0, + max_num_blocks, + (num_tiles * num_blocks_per_tile, ), + dtype=torch.int32, + ) + nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1]( + block_tables.to(device=device), + num_tiles, + num_blocks_per_tile, + q_head_per_kv_head, + head_id, + block_size_tiling_factor, + ).cpu() 
+ ref_out = ref_block_tables_transform( + block_tables, + num_tiles, + num_blocks_per_tile, + q_head_per_kv_head, + head_id, + block_size_tiling_factor, + ) + assert (nki_out.shape == ref_out.shape + ), f"{nki_out.shape=} != {ref_out.shape=}" + assert torch.all(nki_out == ref_out) diff --git a/tests/neuron/1_core/test_prefix_prefill.py b/tests/neuron/1_core/test_prefix_prefill.py index 326a1f82e9b30..37d6679f8d55b 100644 --- a/tests/neuron/1_core/test_prefix_prefill.py +++ b/tests/neuron/1_core/test_prefix_prefill.py @@ -320,6 +320,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size, ]) @torch.inference_mode() def test_contexted_kv_attention( + monkeypatch: pytest.MonkeyPatch, prefill_batch_size: int, decode_batch_size: int, num_heads: int, @@ -329,7 +330,6 @@ def test_contexted_kv_attention( large_tile_size, mixed_precision: bool, ) -> None: - import os import torch_xla.core.xla_model as xm @@ -340,174 +340,178 @@ def test_contexted_kv_attention( device = xm.xla_device() - compiler_flags = [ + compiler_flags_str = " ".join([ "-O1", "--retry_failed_compilation", - ] - compiler_flags_str = " ".join(compiler_flags) - os.environ["NEURON_CC_FLAGS"] = compiler_flags_str + ]) + with monkeypatch.context() as m: + m.setenv("NEURON_CC_FLAGS", compiler_flags_str) - torch.manual_seed(0) - torch.set_printoptions(sci_mode=False) - torch.set_default_device("cpu") - dtype = torch.float32 + torch.manual_seed(0) + torch.set_printoptions(sci_mode=False) + torch.set_default_device("cpu") + dtype = torch.float32 - min_ctx_len = 32 - max_ctx_len = 1024 - min_query_len = 16 - max_query_len = 512 - num_kv_heads = num_heads // num_queries_per_kv - ( - query, - k_active, - v_active, - k_cache, - v_cache, - block_table, - key, - value, - query_lens, - seq_lens, - ) = sample_inputs( - prefill_batch_size=prefill_batch_size, - decode_batch_size=decode_batch_size, - min_query_len=min_query_len, - max_query_len=max_query_len, - min_ctx_len=min_ctx_len, - max_ctx_len=max_ctx_len, - block_size=block_size, - num_heads=num_heads, - num_kv_heads=num_kv_heads, - head_size=head_size, - dtype=dtype, - ) + min_ctx_len = 32 + max_ctx_len = 1024 + min_query_len = 16 + max_query_len = 512 + num_kv_heads = num_heads // num_queries_per_kv + ( + query, + k_active, + v_active, + k_cache, + v_cache, + block_table, + key, + value, + query_lens, + seq_lens, + ) = sample_inputs( + prefill_batch_size=prefill_batch_size, + decode_batch_size=decode_batch_size, + min_query_len=min_query_len, + max_query_len=max_query_len, + min_ctx_len=min_ctx_len, + max_ctx_len=max_ctx_len, + block_size=block_size, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + head_size=head_size, + dtype=dtype, + ) - output_ref = ref_context_attention( - query, - key, - value, - query_lens, - seq_lens, - head_size, - num_queries_per_kv, - return_max_reduce=False, - ) + output_ref = ref_context_attention( + query, + key, + value, + query_lens, + seq_lens, + head_size, + num_queries_per_kv, + return_max_reduce=False, + ) - # build neuron program - B_P_SIZE = 128 - assert (large_tile_size >= B_P_SIZE - ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}" + # build neuron program + B_P_SIZE = 128 + assert (large_tile_size >= B_P_SIZE + ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}" - def ceil_div(a, b): - return (a + b - 1) // b + def ceil_div(a, b): + return (a + b - 1) // b - def pad_to_multiple(a, b): - return ceil_div(a, b) * b + def pad_to_multiple(a, b): + return ceil_div(a, b) * b - def pad_to_next_power_of_2(a): 
- assert a > 0 - return 2**int(a - 1).bit_length() + def pad_to_next_power_of_2(a): + assert a > 0 + return 2**int(a - 1).bit_length() - # calculate input shapes - max_num_queries = pad_to_next_power_of_2(sum(query_lens)) - context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) - num_active_blocks = ceil_div(context_lens, block_size).sum().item() - num_active_blocks = pad_to_multiple(num_active_blocks, - large_tile_size // block_size) - context_kv_len = num_active_blocks * block_size - assert (context_kv_len % + # calculate input shapes + max_num_queries = pad_to_next_power_of_2(sum(query_lens)) + context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) + num_active_blocks = ceil_div(context_lens, block_size).sum().item() + num_active_blocks = pad_to_multiple(num_active_blocks, + large_tile_size // block_size) + context_kv_len = num_active_blocks * block_size + assert ( + context_kv_len % large_tile_size == 0), f"invalid context_kv_len={context_kv_len}" - # pad QKV tensors - pad_dims = ( - 0, - 0, - 0, - 0, - 0, - max_num_queries - query.shape[0], - ) - query = F.pad(query, pad_dims, "constant", 0) - k = F.pad(k_active, pad_dims, "constant", 0) - v = F.pad(v_active, pad_dims, "constant", 0) - - # permute QKV tensors - # query: (1, n_heads, d, seq_q) - # key: (1, n_kv_heads, d, seq_k) - # value: (1, n_kv_heads, seq_v, d) - query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous() - k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous() - v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous() - k_cache = k_cache.permute(0, 2, 1, 3).contiguous() - v_cache = v_cache.permute(0, 2, 1, 3).contiguous() - - # transform block table - active_block_table = get_active_block_tables( - block_table.cpu(), - torch.tensor(query_lens).cpu(), - torch.tensor(seq_lens).cpu(), - block_size, - num_active_blocks, - ) - - # Build attention masks - prior_mask, active_mask = ( - BlockDiagonalCausalFromBottomRightMask.from_seqlens( - query_lens, seq_lens, block_size=block_size)) - prior_mask_padded = F.pad( - prior_mask, - ( + # pad QKV tensors + pad_dims = ( 0, - context_kv_len - prior_mask.shape[1], 0, - max_num_queries - prior_mask.shape[0], - ), - "constant", - 0, - ).bool() - active_mask_padded = F.pad( - active_mask, - ( 0, - max_num_queries - active_mask.shape[1], 0, - max_num_queries - active_mask.shape[0], - ), - "constant", - 0, - ).bool() - attn_mask = torch.concat([prior_mask_padded, active_mask_padded], dim=1) + 0, + max_num_queries - query.shape[0], + ) + query = F.pad(query, pad_dims, "constant", 0) + k = F.pad(k_active, pad_dims, "constant", 0) + v = F.pad(v_active, pad_dims, "constant", 0) - attn_mask = reorder_context_mask(attn_mask, large_tile_size, block_size) + # permute QKV tensors + # query: (1, n_heads, d, seq_q) + # key: (1, n_kv_heads, d, seq_k) + # value: (1, n_kv_heads, seq_v, d) + query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous() + k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous() + v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous() + k_cache = k_cache.permute(0, 2, 1, 3).contiguous() + v_cache = v_cache.permute(0, 2, 1, 3).contiguous() - input_args = ( - query.to(device=device), - k.to(device=device), - v.to(device=device), - k_cache.to(device=device), - v_cache.to(device=device), - active_block_table.to(device=device), - attn_mask.to(device=device), - ) - input_kwargs = dict( - n_kv_head=num_kv_heads, - head_size=head_size, - mixed_precision=mixed_precision, - LARGE_TILE_SZ=large_tile_size, - ) + # transform block table + active_block_table = 
get_active_block_tables( + block_table.cpu(), + torch.tensor(query_lens).cpu(), + torch.tensor(seq_lens).cpu(), + block_size, + num_active_blocks, + ) - output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs) + # Build attention masks + prior_mask, active_mask = ( + BlockDiagonalCausalFromBottomRightMask.from_seqlens( + query_lens, seq_lens, block_size=block_size)) + prior_mask_padded = F.pad( + prior_mask, + ( + 0, + context_kv_len - prior_mask.shape[1], + 0, + max_num_queries - prior_mask.shape[0], + ), + "constant", + 0, + ).bool() + active_mask_padded = F.pad( + active_mask, + ( + 0, + max_num_queries - active_mask.shape[1], + 0, + max_num_queries - active_mask.shape[0], + ), + "constant", + 0, + ).bool() + attn_mask = torch.concat([prior_mask_padded, active_mask_padded], + dim=1) - num_actual_tokens = sum(query_lens) - # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d) - output_nki = output_nki.cpu().permute(0, 2, 1, 3) - output_nki = output_nki[0, :num_actual_tokens, :, :] - output_ref_padded = F.pad( - output_ref, - (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]), - "constant", - 0, - ) - output_ref = output_ref_padded.transpose(0, 1)[0, :num_actual_tokens, :, :] + attn_mask = reorder_context_mask(attn_mask, large_tile_size, + block_size) - torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0) + input_args = ( + query.to(device=device), + k.to(device=device), + v.to(device=device), + k_cache.to(device=device), + v_cache.to(device=device), + active_block_table.to(device=device), + attn_mask.to(device=device), + ) + input_kwargs = dict( + n_kv_head=num_kv_heads, + head_size=head_size, + mixed_precision=mixed_precision, + LARGE_TILE_SZ=large_tile_size, + ) + + output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs) + + num_actual_tokens = sum(query_lens) + # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d) + output_nki = output_nki.cpu().permute(0, 2, 1, 3) + output_nki = output_nki[0, :num_actual_tokens, :, :] + output_ref_padded = F.pad( + output_ref, + (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]), + "constant", + 0, + ) + output_ref = output_ref_padded.transpose( + 0, 1)[0, :num_actual_tokens, :, :] + + torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0) diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index 3be248f5aca45..9d6872e0e0772 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 +import pytest import torch -from tests.kernels.utils import override_backend_env_variable from vllm.attention.selector import get_attn_backend -from vllm.utils import STR_INVALID_VAL +from vllm.utils import STR_BACKEND_ENV_VAR, STR_INVALID_VAL def test_platform_plugins(): @@ -25,8 +25,9 @@ def test_platform_plugins(): f" is loaded. 
The first import:\n{_init_trace}") -def test_oot_attention_backend(monkeypatch): +def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch): # ignore the backend env variable if it is set - override_backend_env_variable(monkeypatch, STR_INVALID_VAL) - backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) - assert backend.get_name() == "Dummy_Backend" + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) + backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) + assert backend.get_name() == "Dummy_Backend" diff --git a/tests/plugins_tests/test_scheduler_plugins.py b/tests/plugins_tests/test_scheduler_plugins.py index 98981a81e909c..7abf5066a4133 100644 --- a/tests/plugins_tests/test_scheduler_plugins.py +++ b/tests/plugins_tests/test_scheduler_plugins.py @@ -22,43 +22,47 @@ class DummyV1Scheduler(V1Scheduler): raise Exception("Exception raised by DummyV1Scheduler") -def test_scheduler_plugins_v0(monkeypatch): - monkeypatch.setenv("VLLM_USE_V1", "0") - with pytest.raises(Exception) as exception_info: +def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + with pytest.raises(Exception) as exception_info: - engine_args = EngineArgs( - model="facebook/opt-125m", - enforce_eager=True, # reduce test time - scheduler_cls=DummyV0Scheduler, - ) + engine_args = EngineArgs( + model="facebook/opt-125m", + enforce_eager=True, # reduce test time + scheduler_cls=DummyV0Scheduler, + ) - engine = LLMEngine.from_engine_args(engine_args=engine_args) + engine = LLMEngine.from_engine_args(engine_args=engine_args) - sampling_params = SamplingParams(max_tokens=1) - engine.add_request("0", "foo", sampling_params) - engine.step() + sampling_params = SamplingParams(max_tokens=1) + engine.add_request("0", "foo", sampling_params) + engine.step() - assert str(exception_info.value) == "Exception raised by DummyV0Scheduler" + assert str( + exception_info.value) == "Exception raised by DummyV0Scheduler" -def test_scheduler_plugins_v1(monkeypatch): - monkeypatch.setenv("VLLM_USE_V1", "1") - # Explicitly turn off engine multiprocessing so that the scheduler runs in - # this process - monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") +def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + # Explicitly turn off engine multiprocessing so + # that the scheduler runs in this process + m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") - with pytest.raises(Exception) as exception_info: + with pytest.raises(Exception) as exception_info: - engine_args = EngineArgs( - model="facebook/opt-125m", - enforce_eager=True, # reduce test time - scheduler_cls=DummyV1Scheduler, - ) + engine_args = EngineArgs( + model="facebook/opt-125m", + enforce_eager=True, # reduce test time + scheduler_cls=DummyV1Scheduler, + ) - engine = V1LLMEngine.from_engine_args(engine_args=engine_args) + engine = V1LLMEngine.from_engine_args(engine_args=engine_args) - sampling_params = SamplingParams(max_tokens=1) - engine.add_request("0", "foo", sampling_params) - engine.step() + sampling_params = SamplingParams(max_tokens=1) + engine.add_request("0", "foo", sampling_params) + engine.step() - assert str(exception_info.value) == "Exception raised by DummyV1Scheduler" + assert str( + exception_info.value) == "Exception raised by DummyV1Scheduler" diff --git a/tests/prefix_caching/test_prefix_caching.py 
b/tests/prefix_caching/test_prefix_caching.py index 7a4bc7aecc0f4..607b6c43e02e2 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -4,25 +4,29 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`. """ +from __future__ import annotations + import pytest from tests.conftest import VllmRunner from tests.core.utils import SchedulerProxy, create_dummy_prompt -from tests.kernels.utils import override_backend_env_variable from vllm import SamplingParams, TokensPrompt from vllm.core.scheduler import Scheduler from vllm.engine.llm_engine import LLMEngine from vllm.platforms import current_platform +from vllm.utils import STR_BACKEND_ENV_VAR from ..models.utils import check_outputs_equal @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): +def use_v0_only(monkeypatch: pytest.MonkeyPatch): """ This module relies on V0 internals, so set VLLM_USE_V1=0. """ - monkeypatch.setenv('VLLM_USE_V1', '0') + with monkeypatch.context() as m: + m.setenv('VLLM_USE_V1', '0') + yield MODELS = [ @@ -56,7 +60,7 @@ def test_mixed_requests( cached_position: int, enable_chunked_prefill: bool, block_size: int, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """ Test the case when some sequences have the prefix cache hit @@ -67,72 +71,77 @@ def test_mixed_requests( pytest.skip("Flashinfer does not support ROCm/HIP.") if backend == "XFORMERS" and current_platform.is_rocm(): pytest.skip("Xformers does not support ROCm/HIP.") - override_backend_env_variable(monkeypatch, backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, backend) - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - cached_prompt = example_prompts[cached_position] - with vllm_runner( - model, - dtype=dtype, - enable_prefix_caching=True, - enable_chunked_prefill=enable_chunked_prefill, - block_size=block_size, - ) as vllm_model: - # Run the first prompt so the cache is populated - vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens) + cached_prompt = example_prompts[cached_position] + with vllm_runner( + model, + dtype=dtype, + enable_prefix_caching=True, + enable_chunked_prefill=enable_chunked_prefill, + block_size=block_size, + ) as vllm_model: + # Run the first prompt so the cache is populated + vllm_outputs = vllm_model.generate_greedy([cached_prompt], + max_tokens) - # Run all the promopts - greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) - req_outputs = vllm_model.model.generate(example_prompts, greedy_params) + # Run all the promopts + greedy_params = SamplingParams(temperature=0.0, + max_tokens=max_tokens) + req_outputs = vllm_model.model.generate(example_prompts, + greedy_params) - # Verify number of cached tokens - for i in range(len(req_outputs)): - if i == cached_position: - expected_num_cached_tokens = ( - len(req_outputs[i].prompt_token_ids) // - block_size) * block_size - else: - expected_num_cached_tokens = 0 - assert ( - req_outputs[i].num_cached_tokens == expected_num_cached_tokens) + # Verify number of cached tokens + for i in range(len(req_outputs)): + if i == cached_position: + expected_num_cached_tokens = ( + len(req_outputs[i].prompt_token_ids) // + block_size) * block_size + else: + expected_num_cached_tokens = 0 + assert (req_outputs[i].num_cached_tokens == + 
expected_num_cached_tokens) - vllm_outputs = [( - output.prompt_token_ids + list(output.outputs[0].token_ids), - output.prompt + output.outputs[0].text, - ) for output in req_outputs] + vllm_outputs = [( + output.prompt_token_ids + list(output.outputs[0].token_ids), + output.prompt + output.outputs[0].text, + ) for output in req_outputs] - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) def test_unstable_prompt_sequence( vllm_runner, backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: if backend == "FLASHINFER" and current_platform.is_rocm(): pytest.skip("Flashinfer does not support ROCm/HIP.") if backend == "XFORMERS" and current_platform.is_rocm(): pytest.skip("Xformers does not support ROCm/HIP.") - override_backend_env_variable(monkeypatch, backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, backend) - with vllm_runner( - "Qwen/Qwen2.5-0.5B-Instruct", - enable_chunked_prefill=True, - enable_prefix_caching=True, - max_model_len=4096, - ) as vllm_model: - for prompt in UNSTABLE_PROMPT_SEQUENCE: - vllm_model.generate(TokensPrompt(prompt_token_ids=prompt), - SamplingParams(max_tokens=1)) + with vllm_runner( + "Qwen/Qwen2.5-0.5B-Instruct", + enable_chunked_prefill=True, + enable_prefix_caching=True, + max_model_len=4096, + ) as vllm_model: + for prompt in UNSTABLE_PROMPT_SEQUENCE: + vllm_model.generate(TokensPrompt(prompt_token_ids=prompt), + SamplingParams(max_tokens=1)) @pytest.mark.parametrize("model", MODELS) diff --git a/tests/test_regression.py b/tests/test_regression.py index b54dc6af3e9a6..8c9d4a91c73be 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -56,12 +56,11 @@ def test_gc(): assert allocated < 50 * 1024 * 1024 -def test_model_from_modelscope(monkeypatch): +def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch): # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary - MODELSCOPE_MODEL_NAME = "qwen/Qwen1.5-0.5B-Chat" - monkeypatch.setenv("VLLM_USE_MODELSCOPE", "True") - try: - llm = LLM(model=MODELSCOPE_MODEL_NAME) + with monkeypatch.context() as m: + m.setenv("VLLM_USE_MODELSCOPE", "True") + llm = LLM(model="qwen/Qwen1.5-0.5B-Chat") prompts = [ "Hello, my name is", @@ -73,10 +72,3 @@ def test_model_from_modelscope(monkeypatch): outputs = llm.generate(prompts, sampling_params) assert len(outputs) == 4 - finally: - monkeypatch.delenv("VLLM_USE_MODELSCOPE", raising=False) - - -if __name__ == "__main__": - import pytest - pytest.main([__file__]) diff --git a/tests/test_utils.py b/tests/test_utils.py index dcca7d5965e9e..ae4fddd046d45 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +# ruff: noqa import asyncio -import os import socket from collections.abc import AsyncIterator from unittest.mock import patch @@ -112,16 +112,16 @@ def test_deprecate_kwargs_additional_message(): dummy(old_arg=1) -def test_get_open_port(): - os.environ["VLLM_PORT"] = "5678" - # make sure we can get multiple ports, even if the env var is set - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1: - s1.bind(("localhost", get_open_port())) - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2: - s2.bind(("localhost", get_open_port())) - with socket.socket(socket.AF_INET, 
socket.SOCK_STREAM) as s3: - s3.bind(("localhost", get_open_port())) - os.environ.pop("VLLM_PORT") +def test_get_open_port(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_PORT", "5678") + # make sure we can get multiple ports, even if the env var is set + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1: + s1.bind(("localhost", get_open_port())) + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2: + s2.bind(("localhost", get_open_port())) + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3: + s3.bind(("localhost", get_open_port())) # Tests for FlexibleArgumentParser @@ -366,31 +366,32 @@ def test_bind_kv_cache_non_attention(): assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1] -def test_bind_kv_cache_encoder_decoder(monkeypatch): +def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch): # V1 TESTS: ENCODER_DECODER is not supported on V1 yet. - monkeypatch.setenv("VLLM_USE_V1", "0") + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") - from vllm.attention import Attention, AttentionType + from vllm.attention import Attention, AttentionType - # example from bart - ctx = { - 'encoder.layers.0.self_attn.attn': - Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER), - 'decoder.layers.0.encoder_attn.attn': - Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER), - 'decoder.layers.0.self_attn.attn': - Attention(32, 128, 0.1, attn_type=AttentionType.DECODER), - } + # example from bart + ctx = { + 'encoder.layers.0.self_attn.attn': + Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER), + 'decoder.layers.0.encoder_attn.attn': + Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER), + 'decoder.layers.0.self_attn.attn': + Attention(32, 128, 0.1, attn_type=AttentionType.DECODER), + } - kv_cache = [ - torch.zeros((1, )), - ] - encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache + kv_cache = [ + torch.zeros((1, )), + ] + encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache - bind_kv_cache(ctx, [kv_cache]) - assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache - assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0] - assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0] + bind_kv_cache(ctx, [kv_cache]) + assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache + assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0] + assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0] def test_bind_kv_cache_pp(): diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py index e94bbd2877225..f7a59f054b61b 100644 --- a/tests/tpu/test_custom_dispatcher.py +++ b/tests/tpu/test_custom_dispatcher.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -import os +import pytest from vllm.config import CompilationLevel @@ -9,16 +9,17 @@ from ..utils import compare_two_settings # --enforce-eager on TPU causes graph compilation # this times out default Health Check in the MQLLMEngine, # so we set the timeout here to 30s -os.environ["VLLM_RPC_TIMEOUT"] = "30000" -def test_custom_dispatcher(): - compare_two_settings( - "google/gemma-2b", - arg1=[ - "--enforce-eager", - f"-O{CompilationLevel.DYNAMO_ONCE}", - ], - arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"], - env1={}, - env2={}) +def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + 
m.setenv("VLLM_RPC_TIMEOUT", "30000") + compare_two_settings( + "google/gemma-2b", + arg1=[ + "--enforce-eager", + f"-O{CompilationLevel.DYNAMO_ONCE}", + ], + arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"], + env1={}, + env2={}) diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index 97149884497af..a781b8b563be1 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -1,10 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 +# ruff: noqa +# type: ignore +from __future__ import annotations -import os import threading from collections.abc import Iterable from concurrent import futures -from typing import Callable, Literal +from typing import Callable, Generator, Literal import grpc import pytest @@ -21,12 +23,14 @@ from vllm.tracing import SpanAttributes @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): +def use_v0_only(monkeypatch: pytest.MonkeyPatch): """ Since this module is V0 only, set VLLM_USE_V1=0 for all tests in the module. """ - monkeypatch.setenv('VLLM_USE_V1', '0') + with monkeypatch.context() as m: + m.setenv('VLLM_USE_V1', '0') + yield FAKE_TRACE_SERVER_ADDRESS = "localhost:4317" @@ -67,7 +71,7 @@ class FakeTraceService(TraceServiceServicer): @pytest.fixture -def trace_service(): +def trace_service() -> Generator[FakeTraceService, None, None]: """Fixture to set up a fake gRPC trace service""" server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) service = FakeTraceService() @@ -80,136 +84,153 @@ def trace_service(): server.stop(None) -def test_traces(trace_service): - os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" +def test_traces( + monkeypatch: pytest.MonkeyPatch, + trace_service: FakeTraceService, +): + with monkeypatch.context() as m: + m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") - sampling_params = SamplingParams(temperature=0.01, - top_p=0.1, - max_tokens=256) - model = "facebook/opt-125m" - llm = LLM( - model=model, - otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, - ) - prompts = ["This is a short prompt"] - outputs = llm.generate(prompts, sampling_params=sampling_params) + sampling_params = SamplingParams( + temperature=0.01, + top_p=0.1, + max_tokens=256, + ) + model = "facebook/opt-125m" + llm = LLM( + model=model, + otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, + ) + prompts = ["This is a short prompt"] + outputs = llm.generate(prompts, sampling_params=sampling_params) - timeout = 5 - if not trace_service.evt.wait(timeout): - raise TimeoutError( - f"The fake trace service didn't receive a trace within " - f"the {timeout} seconds timeout") + timeout = 5 + if not trace_service.evt.wait(timeout): + raise TimeoutError( + f"The fake trace service didn't receive a trace within " + f"the {timeout} seconds timeout") - request = trace_service.request - assert len(request.resource_spans) == 1, ( - f"Expected 1 resource span, " - f"but got {len(request.resource_spans)}") - assert len(request.resource_spans[0].scope_spans) == 1, ( - f"Expected 1 scope span, " - f"but got {len(request.resource_spans[0].scope_spans)}") - assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( - f"Expected 1 span, " - f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") + request = trace_service.request + assert len(request.resource_spans) == 1, ( + f"Expected 1 resource span, " + f"but got {len(request.resource_spans)}") + assert len(request.resource_spans[0].scope_spans) == 1, ( + f"Expected 1 scope span, " + f"but got 
{len(request.resource_spans[0].scope_spans)}") + assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( + f"Expected 1 span, " + f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") - attributes = decode_attributes( - request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE - ) == sampling_params.temperature - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n - assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( - outputs[0].prompt_token_ids) - completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens - metrics = outputs[0].metrics - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue - ttft = metrics.first_token_time - metrics.arrival_time - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft - e2e_time = metrics.finished_time - metrics.arrival_time - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time - assert metrics.scheduler_time > 0 - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER - ) == metrics.scheduler_time - # Model forward and model execute should be none, since detailed traces is - # not enabled. - assert metrics.model_forward_time is None - assert metrics.model_execute_time is None + attributes = decode_attributes( + request.resource_spans[0].scope_spans[0].spans[0].attributes) + assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE + ) == sampling_params.temperature + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS + ) == sampling_params.max_tokens + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n + assert attributes.get( + SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( + outputs[0].prompt_token_ids) + completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) + assert attributes.get( + SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens + metrics = outputs[0].metrics + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE + ) == metrics.time_in_queue + ttft = metrics.first_token_time - metrics.arrival_time + assert attributes.get( + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft + e2e_time = metrics.finished_time - metrics.arrival_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time + assert metrics.scheduler_time > 0 + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER + ) == metrics.scheduler_time + # Model forward and model execute should be none, since detailed traces is + # not enabled. 
+ assert metrics.model_forward_time is None + assert metrics.model_execute_time is None -def test_traces_with_detailed_steps(trace_service): - os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" +def test_traces_with_detailed_steps( + monkeypatch: pytest.MonkeyPatch, + trace_service: FakeTraceService, +): + with monkeypatch.context() as m: + m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") - sampling_params = SamplingParams(temperature=0.01, - top_p=0.1, - max_tokens=256) - model = "facebook/opt-125m" - llm = LLM( - model=model, - otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, - collect_detailed_traces="all", - ) - prompts = ["This is a short prompt"] - outputs = llm.generate(prompts, sampling_params=sampling_params) + sampling_params = SamplingParams( + temperature=0.01, + top_p=0.1, + max_tokens=256, + ) + model = "facebook/opt-125m" + llm = LLM( + model=model, + otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, + collect_detailed_traces="all", + ) + prompts = ["This is a short prompt"] + outputs = llm.generate(prompts, sampling_params=sampling_params) - timeout = 5 - if not trace_service.evt.wait(timeout): - raise TimeoutError( - f"The fake trace service didn't receive a trace within " - f"the {timeout} seconds timeout") + timeout = 5 + if not trace_service.evt.wait(timeout): + raise TimeoutError( + f"The fake trace service didn't receive a trace within " + f"the {timeout} seconds timeout") - request = trace_service.request - assert len(request.resource_spans) == 1, ( - f"Expected 1 resource span, " - f"but got {len(request.resource_spans)}") - assert len(request.resource_spans[0].scope_spans) == 1, ( - f"Expected 1 scope span, " - f"but got {len(request.resource_spans[0].scope_spans)}") - assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( - f"Expected 1 span, " - f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") + request = trace_service.request + assert len(request.resource_spans) == 1, ( + f"Expected 1 resource span, " + f"but got {len(request.resource_spans)}") + assert len(request.resource_spans[0].scope_spans) == 1, ( + f"Expected 1 scope span, " + f"but got {len(request.resource_spans[0].scope_spans)}") + assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( + f"Expected 1 span, " + f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") - attributes = decode_attributes( - request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE - ) == sampling_params.temperature - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n - assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( - outputs[0].prompt_token_ids) - completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens - metrics = outputs[0].metrics - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue - ttft = metrics.first_token_time - metrics.arrival_time - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft - e2e_time = metrics.finished_time - 
metrics.arrival_time - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time - assert metrics.scheduler_time > 0 - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER - ) == metrics.scheduler_time - assert metrics.model_forward_time > 0 - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx( - metrics.model_forward_time / 1000) - assert metrics.model_execute_time > 0 - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE - ) == metrics.model_execute_time - assert metrics.model_forward_time < 1000 * metrics.model_execute_time + attributes = decode_attributes( + request.resource_spans[0].scope_spans[0].spans[0].attributes) + assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE + ) == sampling_params.temperature + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS + ) == sampling_params.max_tokens + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n + assert attributes.get( + SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( + outputs[0].prompt_token_ids) + completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) + assert attributes.get( + SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens + metrics = outputs[0].metrics + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE + ) == metrics.time_in_queue + ttft = metrics.first_token_time - metrics.arrival_time + assert attributes.get( + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft + e2e_time = metrics.finished_time - metrics.arrival_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time + assert metrics.scheduler_time > 0 + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER + ) == metrics.scheduler_time + assert metrics.model_forward_time > 0 + assert attributes.get( + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD + ) == pytest.approx(metrics.model_forward_time / 1000) + assert metrics.model_execute_time > 0 + assert attributes.get( + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE + ) == metrics.model_execute_time + assert metrics.model_forward_time < 1000 * metrics.model_execute_time diff --git a/tests/utils.py b/tests/utils.py index fc19c8d031b16..06ba8a2421c16 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -566,6 +566,7 @@ def init_test_distributed_environment( def multi_process_parallel( + monkeypatch: pytest.MonkeyPatch, tp_size: int, pp_size: int, test_target: Any, @@ -582,7 +583,13 @@ def multi_process_parallel( refs = [] for rank in range(tp_size * pp_size): refs.append( - test_target.remote(tp_size, pp_size, rank, distributed_init_port)) + test_target.remote( + monkeypatch, + tp_size, + pp_size, + rank, + distributed_init_port, + ), ) ray.get(refs) ray.shutdown() @@ -700,7 +707,7 @@ def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator: """ Get a pytest mark, which skips the test if the GPU doesn't meet a minimum memory requirement in GB. - + This can be leveraged via `@large_gpu_test` to skip tests in environments without enough resources, or called when filtering tests to run directly. 
""" diff --git a/tests/v1/e2e/test_ngram_spec_decode.py b/tests/v1/e2e/test_ngram_spec_decode.py index 519a74cab84bc..6cca324514565 100644 --- a/tests/v1/e2e/test_ngram_spec_decode.py +++ b/tests/v1/e2e/test_ngram_spec_decode.py @@ -1,5 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import random +from typing import Any import pytest @@ -50,8 +53,12 @@ def model_name(): return "meta-llama/Meta-Llama-3-8B-Instruct" -def test_ngram_correctness(monkeypatch, test_prompts, sampling_config, - model_name): +def test_ngram_correctness( + monkeypatch: pytest.MonkeyPatch, + test_prompts: list[list[dict[str, Any]]], + sampling_config: SamplingParams, + model_name: str, +): ''' Compare the outputs of a original LLM and a speculative LLM should be the same when using ngram speculative decoding. diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 5b9725d59ddc5..0ff804976ada6 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -80,9 +80,11 @@ async def generate(engine: AsyncLLM, [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)]) @pytest.mark.asyncio -async def test_load(monkeypatch, output_kind: RequestOutputKind, - engine_args_and_prompt: tuple[AsyncEngineArgs, - PromptType]): +async def test_load( + monkeypatch: pytest.MonkeyPatch, + output_kind: RequestOutputKind, + engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType], +): # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 # so that in the future when we switch, we don't have to change all the # tests. @@ -126,7 +128,8 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind, [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)]) @pytest.mark.asyncio -async def test_abort(monkeypatch, output_kind: RequestOutputKind, +async def test_abort(monkeypatch: pytest.MonkeyPatch, + output_kind: RequestOutputKind, engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType]): diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 5fdbcf5b99636..2ec4f7e034af8 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -45,7 +45,7 @@ def make_request() -> EngineCoreRequest: @fork_new_process_for_each_test -def test_engine_core(monkeypatch): +def test_engine_core(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") @@ -159,10 +159,10 @@ def test_engine_core(monkeypatch): @fork_new_process_for_each_test -def test_engine_core_advanced_sampling(monkeypatch): +def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch): """ - A basic end-to-end test to verify that the engine functions correctly - when additional sampling parameters, such as top_p, min_tokens, and + A basic end-to-end test to verify that the engine functions correctly + when additional sampling parameters, such as top_p, min_tokens, and presence_penalty, are set. """ with monkeypatch.context() as m: @@ -209,7 +209,7 @@ def test_engine_core_advanced_sampling(monkeypatch): @fork_new_process_for_each_test -def test_engine_core_concurrent_batches(monkeypatch): +def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): """ Test that the engine can handle multiple concurrent batches. 
""" diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index e646ccbd46030..004b4dc82f4d9 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -90,7 +90,8 @@ def echo(self, msg: str, err_msg: Optional[str] = None) -> str: @fork_new_process_for_each_test @pytest.mark.parametrize("multiprocessing_mode", [True, False]) -def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): +def test_engine_core_client(monkeypatch: pytest.MonkeyPatch, + multiprocessing_mode: bool): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") @@ -175,7 +176,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): @pytest.mark.asyncio(loop_scope="function") -async def test_engine_core_client_asyncio(monkeypatch): +async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index e763aa2c86998..3800cb392fbad 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -57,7 +57,7 @@ def _repeat_logprob_config( logprob_prompt_logprob_list: BatchLogprobsSpecType, ) -> BatchLogprobsSpecType: """Ensure each test prompt has a logprob config. - + A logprob config specifies the optional (i.e. may-be-`None`) number of sample logprobs and the optional number of prompt logprobs. @@ -80,7 +80,7 @@ def _repeat_logprob_config( (optional num sample logprob, optional num prompt logprob) tuples - + Returns: list of (optional num sample logprob,optional num prompt logprob) @@ -255,14 +255,12 @@ def _run_and_validate( [NONE, SAMPLE, PROMPT, SAMPLE_PROMPT]) @pytest.mark.parametrize("temperature", [0.0, 2.0]) def test_get_logprobs_and_prompt_logprobs( - hf_model, - vllm_model, - batch_logprobs_composition: BatchLogprobsComposition, - temperature: float, - example_prompts, -) -> None: + hf_model, vllm_model, + batch_logprobs_composition: BatchLogprobsComposition, + temperature: float, example_prompts: list[str], + monkeypatch: pytest.MonkeyPatch) -> None: """Test V1 Engine logprobs & prompt logprobs - + Exercise a variety of combinations of `logprobs` and `prompt_logprobs` settings and validate that * The generated logprobs and prompt logprobs are consistent with the @@ -279,7 +277,7 @@ def test_get_logprobs_and_prompt_logprobs( To save time, only test one APC-enabled scenario (sample & prompt logprobs enabled, temperature>0.0). - + Args: hf_model: HuggingFace reference model fixture vllm_model: vLLM model fixture @@ -287,128 +285,140 @@ def test_get_logprobs_and_prompt_logprobs( temperature: "temperature" sampling parameter example_prompts: example prompt fixture """ - do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching - if do_apc and (temperature < 2.0 - or batch_logprobs_composition != SAMPLE_PROMPT): - # Skip some test-cases to save time. - pytest.skip() - test_prompts = example_prompts + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching + if do_apc and (temperature < 2.0 + or batch_logprobs_composition != SAMPLE_PROMPT): + # Skip some test-cases to save time. 
+ pytest.skip() + test_prompts = example_prompts - max_tokens = 5 - hf_outputs = hf_model.generate_greedy( - test_prompts, - max_tokens=max_tokens, - ) - hf_logprobs = hf_model.generate_greedy_logprobs( - test_prompts, - max_tokens=max_tokens, - ) - - # Batch has mixed sample params - # (different logprobs/prompt logprobs combos) - logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition) - - # Ensure that each test prompt has a logprob config for testing - logprob_prompt_logprob_list = _repeat_logprob_config( - test_prompts, logprob_prompt_logprob_list) - # Generate SamplingParams - vllm_sampling_params = [ - SamplingParams(max_tokens=max_tokens, - logprobs=num_lp, - prompt_logprobs=num_plp, - temperature=temperature, - seed=1984) - for num_lp, num_plp in logprob_prompt_logprob_list - ] - for _ in range(2 if do_apc else 1): - _run_and_validate( - vllm_model=vllm_model, - test_prompts=test_prompts, - vllm_sampling_params=vllm_sampling_params, - hf_logprobs=hf_logprobs, - hf_outputs=hf_outputs, - logprob_prompt_logprob_list=logprob_prompt_logprob_list, - temperature=temperature, + max_tokens = 5 + hf_outputs = hf_model.generate_greedy( + test_prompts, max_tokens=max_tokens, - do_apc=do_apc) + ) + hf_logprobs = hf_model.generate_greedy_logprobs( + test_prompts, + max_tokens=max_tokens, + ) + + # Batch has mixed sample params + # (different logprobs/prompt logprobs combos) + logprob_prompt_logprob_list = get_test_batch( + batch_logprobs_composition) + + # Ensure that each test prompt has a logprob config for testing + logprob_prompt_logprob_list = _repeat_logprob_config( + test_prompts, logprob_prompt_logprob_list) + # Generate SamplingParams + vllm_sampling_params = [ + SamplingParams(max_tokens=max_tokens, + logprobs=num_lp, + prompt_logprobs=num_plp, + temperature=temperature, + seed=1984) + for num_lp, num_plp in logprob_prompt_logprob_list + ] + for _ in range(2 if do_apc else 1): + _run_and_validate( + vllm_model=vllm_model, + test_prompts=test_prompts, + vllm_sampling_params=vllm_sampling_params, + hf_logprobs=hf_logprobs, + hf_outputs=hf_outputs, + logprob_prompt_logprob_list=logprob_prompt_logprob_list, + temperature=temperature, + max_tokens=max_tokens, + do_apc=do_apc) -def test_max_logprobs(): +def test_max_logprobs(monkeypatch: pytest.MonkeyPatch): """vLLM v1 engine should fail a request with `logprobs > max_logprobs` - Should also fail for `prompt_logprobs > max_logprobs` - APC should not matter as this test checks basic request validation. 
- - Args: - monkeypatch """ + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") - runner = VllmRunner("facebook/opt-125m", - max_logprobs=1, - enable_prefix_caching=False, - max_model_len=256) - vllm_sampling_params = SamplingParams(logprobs=1) - # should pass - runner.generate(["Hello world"], sampling_params=vllm_sampling_params) + runner = VllmRunner("facebook/opt-125m", + max_logprobs=1, + enable_prefix_caching=False, + max_model_len=256) + vllm_sampling_params = SamplingParams(logprobs=1) + # should pass + runner.generate(["Hello world"], sampling_params=vllm_sampling_params) - bad_sampling_params = SamplingParams(logprobs=2) - with pytest.raises(ValueError): - runner.generate(["Hello world"], sampling_params=bad_sampling_params) + bad_sampling_params = SamplingParams(logprobs=2) + with pytest.raises(ValueError): + runner.generate(["Hello world"], + sampling_params=bad_sampling_params) -def test_none_logprobs(vllm_model, example_prompts): +def test_none_logprobs(vllm_model, example_prompts, + monkeypatch: pytest.MonkeyPatch): """Engine should return `logprobs` and `prompt_logprobs` as `None` - + Args: vllm_model: vLLM model fixture example_prompts: list of example prompts (test fixture) """ - max_tokens = 5 + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + max_tokens = 5 - sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, - logprobs=None, - prompt_logprobs=None, - temperature=0.0) - results_logprobs_none = vllm_model.model.generate( - example_prompts, sampling_params=sampling_params_logprobs_none) + sampling_params_logprobs_none = SamplingParams( + max_tokens=max_tokens, + logprobs=None, + prompt_logprobs=None, + temperature=0.0, + ) + results_logprobs_none = vllm_model.model.generate( + example_prompts, + sampling_params=sampling_params_logprobs_none, + ) - for i in range(len(results_logprobs_none)): - # Check sample logprobs are None - assert results_logprobs_none[i].outputs[0].logprobs is None - assert results_logprobs_none[i].outputs[0].cumulative_logprob is None - # Check prompt logprobs are None - assert results_logprobs_none[i].prompt_logprobs is None + for i in range(len(results_logprobs_none)): + # Check sample logprobs are None + assert results_logprobs_none[i].outputs[0].logprobs is None + assert results_logprobs_none[i].outputs[ + 0].cumulative_logprob is None + # Check prompt logprobs are None + assert results_logprobs_none[i].prompt_logprobs is None -def test_zero_logprobs(vllm_model, example_prompts): +def test_zero_logprobs(vllm_model, example_prompts, + monkeypatch: pytest.MonkeyPatch): """Engine should return sampled token and prompt token logprobs - + Args: vllm_model: vLLM model fixture example_prompts: list of example prompts (test fixture) """ - max_tokens = 5 + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + max_tokens = 5 - sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens, - logprobs=0, - prompt_logprobs=0, - temperature=0.0) - results_logprobs_zero = vllm_model.model.generate( - example_prompts, sampling_params=sampling_params_logprobs_zero) + sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens, + logprobs=0, + prompt_logprobs=0, + temperature=0.0) + results_logprobs_zero = vllm_model.model.generate( + example_prompts, sampling_params=sampling_params_logprobs_zero) - for i in range(len(results_logprobs_zero)): - # Check that there is one sample logprob dict for each - # sample token - logprobs = results_logprobs_zero[i].outputs[0].logprobs - 
prompt_logprobs = results_logprobs_zero[i].prompt_logprobs - sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids - prompt_token_ids = results_logprobs_zero[i].prompt_token_ids - assert logprobs is not None - assert len(sampled_token_ids) == len(logprobs) - assert results_logprobs_zero[i].outputs[ - 0].cumulative_logprob is not None - # Check that there is one prompt logprob dict for each - # prompt token - assert prompt_logprobs is not None - assert len(prompt_token_ids) == len(prompt_logprobs) + for i in range(len(results_logprobs_zero)): + # Check that there is one sample logprob dict for each + # sample token + logprobs = results_logprobs_zero[i].outputs[0].logprobs + prompt_logprobs = results_logprobs_zero[i].prompt_logprobs + sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids + prompt_token_ids = results_logprobs_zero[i].prompt_token_ids + assert logprobs is not None + assert len(sampled_token_ids) == len(logprobs) + assert results_logprobs_zero[i].outputs[ + 0].cumulative_logprob is not None + # Check that there is one prompt logprob dict for each + # prompt token + assert prompt_logprobs is not None + assert len(prompt_token_ids) == len(prompt_logprobs) diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py index 0309f545ea49e..241f49e4faea8 100644 --- a/tests/v1/tpu/test_basic.py +++ b/tests/v1/tpu/test_basic.py @@ -3,11 +3,16 @@ Run `pytest tests/v1/tpu/test_basic.py`. """ +from __future__ import annotations + +from typing import TYPE_CHECKING + import pytest from vllm.platforms import current_platform -from ...conftest import VllmRunner +if TYPE_CHECKING: + from tests.conftest import VllmRunner MODELS = [ # "Qwen/Qwen2-7B-Instruct", @@ -28,7 +33,8 @@ TENSOR_PARALLEL_SIZES = [1] @pytest.mark.parametrize("enforce_eager", [True]) @pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES) def test_models( - monkeypatch, + vllm_runner: type[VllmRunner], + monkeypatch: pytest.MonkeyPatch, model: str, max_tokens: int, enforce_eager: bool, @@ -41,7 +47,7 @@ def test_models( with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - with VllmRunner( + with vllm_runner( model, max_model_len=8192, enforce_eager=enforce_eager, @@ -50,5 +56,5 @@ def test_models( tensor_parallel_size=tensor_parallel_size) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - output = vllm_outputs[0][1] - assert "1024" in output + output = vllm_outputs[0][1] + assert "1024" in output From 583a9778e0bc65b031bc3e430d8f13655f727ec7 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Sun, 16 Mar 2025 21:48:11 -0700 Subject: [PATCH 21/34] [Benchmark] Do not save detailed info to json by default (#14879) Signed-off-by: simon-mo --- benchmarks/backend_request_func.py | 5 ++++- benchmarks/benchmark_serving.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 6a7db920b5b63..09c8e23ebb1c3 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -14,7 +14,8 @@ from tqdm.asyncio import tqdm from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) -from vllm.model_executor.model_loader.weight_utils import get_lock +# NOTE(simon): do not import vLLM here so the benchmark script +# can run without vLLM installed. 
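# (Editor's illustration, not part of the change above; `_lock_for` is a
# hypothetical name.) The pattern this NOTE refers to is deferring vLLM
# imports into the function bodies that need them, so this benchmark module
# stays importable without vLLM, roughly:
#
#     def _lock_for(model_name: str):
#         # Imported lazily; only evaluated when model weights are fetched.
#         from vllm.model_executor.model_loader.weight_utils import get_lock
#         return get_lock(model_name)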
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) @@ -427,6 +428,8 @@ def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': from modelscope import snapshot_download + from vllm.model_executor.model_loader.weight_utils import get_lock + # Use file lock to prevent multiple processes from # downloading the same model weights at the same time. with get_lock(pretrained_model_name_or_path): diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 1dd01ca968678..47627126b6688 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -684,6 +684,15 @@ def main(args: argparse.Namespace): "Invalid metadata format. Please use KEY=VALUE format." ) + if not args.save_detailed: + # Remove fields with too many data points + for field in [ + "input_lens", "output_lens", "ttfts", "itls", + "generated_texts", "errors" + ]: + if field in result_json: + del result_json[field] + # Traffic result_json["request_rate"] = (args.request_rate if args.request_rate < float("inf") else "inf") @@ -828,6 +837,12 @@ if __name__ == "__main__": action="store_true", help="Specify to save benchmark results to a json file", ) + parser.add_argument( + "--save-detailed", + action="store_true", + help="When saving the results, whether to include per request " + "information such as response, error, ttfs, tpots, etc.", + ) parser.add_argument( "--metadata", metavar="KEY=VALUE", From 8d6cf89526ff983b7eb74aad3903138004ae95cd Mon Sep 17 00:00:00 2001 From: Lily Liu Date: Sun, 16 Mar 2025 22:00:20 -0700 Subject: [PATCH 22/34] [V1] [Spec Decode] Support random sampling for spec decode (#13933) Co-authored-by: Woosuk Kwon --- tests/v1/sample/test_rejection_sampler.py | 301 +++++++++++++--- vllm/v1/sample/rejection_sampler.py | 400 +++++++++++++++------- vllm/v1/sample/sampler.py | 8 - vllm/v1/spec_decode/utils.py | 22 ++ vllm/v1/worker/gpu_model_runner.py | 31 +- 5 files changed, 568 insertions(+), 194 deletions(-) create mode 100644 vllm/v1/spec_decode/utils.py diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py index 190927745f1fe..84139a40b544a 100644 --- a/tests/v1/sample/test_rejection_sampler.py +++ b/tests/v1/sample/test_rejection_sampler.py @@ -1,37 +1,51 @@ # SPDX-License-Identifier: Apache-2.0 +from typing import Any, Optional import pytest import torch +import torch.nn.functional as F from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import INVALID_TOKEN_ID, RejectionSampler +DEVICE = "cpu" + @pytest.fixture def sampler(): return RejectionSampler() -def create_logits_tensor(token_ids: list[int], +def create_logits_tensor(token_ids: list[list[int]], vocab_size: int = 100) -> torch.Tensor: """Helper function to create logits tensor that will produce desired token ids on argmax""" - logits = torch.full((len(token_ids), vocab_size), -100.0).cuda() - for i, token_id in enumerate(token_ids): - logits[i, token_id] = 100.0 + num_total_tokens = sum(len(tokens) for tokens in token_ids) + logits = torch.full((num_total_tokens, vocab_size), -100.0, device=DEVICE) + start_loc = 0 + for tokens in token_ids: + for j, token_id in enumerate(tokens): + logits[start_loc + j, token_id] = 100.0 + start_loc += len(tokens) return logits -def create_sampling_metadata(spec_tokens: list[list[int]]) -> SamplingMetadata: - batch_size = len(spec_tokens) +def create_sampling_metadata( + all_greedy: bool, + generators: Optional[dict[int, 
Any]] = None) -> SamplingMetadata: + """Create a v1 sampling metadata object with all_greedy set + to the given value. Either all greedy or all random sampling + is used. + """ + generators = generators or {} return SamplingMetadata( temperature=torch.tensor([]), - all_greedy=True, - all_random=False, + all_greedy=all_greedy, + all_random=not all_greedy, top_p=None, top_k=None, - min_p=torch.empty(batch_size, ), - generators={}, + min_p=torch.empty(1, ), + generators=generators, max_num_logprobs=0, no_penalties=False, prompt_token_ids=None, @@ -40,129 +54,310 @@ def create_sampling_metadata(spec_tokens: list[list[int]]) -> SamplingMetadata: repetition_penalties=torch.tensor([]), output_token_ids=[], min_tokens={}, - logit_bias=[None] * batch_size, + logit_bias=[None], allowed_token_ids_mask=None, bad_words_token_ids={}, ) +########################### Tests for Greedy Sampling ################### def test_perfect_match(sampler): """Test when output tokens perfectly match speculated tokens""" spec_tokens = [[1, 2, 3]] - output_tokens = [1, 2, 3, 4] # 4 is the bonus token + output_tokens = [[1, 2, 3, 4]] # 4 is the bonus token - metadata = create_sampling_metadata(spec_tokens) + metadata = create_sampling_metadata(all_greedy=True) logits = create_logits_tensor(output_tokens) + bonus_token_tensor = torch.tensor([output_tokens[0][-1]], + device=logits.device) - output = sampler(spec_tokens, logits, metadata) + output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata) expected = torch.tensor([[1, 2, 3, 4]], dtype=torch.int, device=logits.device) - assert torch.equal(output.sampled_token_ids, expected) + assert torch.equal(output, expected) def test_early_mismatch(sampler): """Test when there's an early mismatch in tokens""" spec_tokens = [[1, 2, 3]] - output_tokens = [1, 5, 3, 4] # Mismatch at position 1 + output_tokens = [[1, 5, 3, 4]] # Mismatch at position 1 - metadata = create_sampling_metadata(spec_tokens) + metadata = create_sampling_metadata(all_greedy=True) logits = create_logits_tensor(output_tokens) + bonus_token_tensor = torch.tensor([output_tokens[0][-1]], + device=logits.device) - output = sampler(spec_tokens, logits, metadata) + output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata) expected = torch.tensor([[1, 5, INVALID_TOKEN_ID, INVALID_TOKEN_ID]], dtype=torch.int, device=logits.device) - assert torch.equal(output.sampled_token_ids, expected) + assert torch.equal(output, expected) def test_multiple_sequences(sampler): """Test handling multiple sequences of speculated tokens""" spec_tokens = [[1, 2], [3]] - output_tokens = [1, 2, 5, 3, 4] # Two sequences with bonus tokens 5 and 4 + output_tokens = [[1, 2, 5], [3, + 4]] # Two sequences with bonus tokens 5 and 4 - metadata = create_sampling_metadata(spec_tokens) + metadata = create_sampling_metadata(all_greedy=True) logits = create_logits_tensor(output_tokens) + bonus_token_tensor = torch.tensor( + [output_tokens[0][-1], output_tokens[1][-1]], device=logits.device) - output = sampler(spec_tokens, logits, metadata) + output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata) expected = torch.tensor([[1, 2, 5], [3, 4, INVALID_TOKEN_ID]], dtype=torch.int, device=logits.device) - assert torch.equal(output.sampled_token_ids, expected) + assert torch.equal(output, expected) def test_single_token_sequence(sampler): """Test handling sequences with single token""" spec_tokens = [[1]] - output_tokens = [1, 2] # Single token with bonus token 2 + output_tokens = [[1, 2]] # Single token with 
bonus token 2 - metadata = create_sampling_metadata(spec_tokens) + metadata = create_sampling_metadata(all_greedy=True) logits = create_logits_tensor(output_tokens) + bonus_token_tensor = torch.tensor([output_tokens[0][-1]], + device=logits.device) - output = sampler(spec_tokens, logits, metadata) + output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata) expected = torch.tensor([[1, 2]], dtype=torch.int, device=logits.device) - assert torch.equal(output.sampled_token_ids, expected) + assert torch.equal(output, expected) def test_empty_sequence(sampler): """Test handling empty sequence of speculated tokens""" spec_tokens: list[list[int]] = [[]] - output_tokens = [5] # Just the bonus token + output_tokens = [[5]] # Just the bonus token - metadata = create_sampling_metadata(spec_tokens) + metadata = create_sampling_metadata(all_greedy=True) logits = create_logits_tensor(output_tokens) + bonus_token_tensor = torch.tensor([output_tokens[0][-1]], + device=logits.device) - output = sampler(spec_tokens, logits, metadata) + output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata) expected = torch.tensor([[5]], dtype=torch.int, device=logits.device) - assert torch.equal(output.sampled_token_ids, expected) + assert torch.equal(output, expected) def test_multiple_mismatches(sampler): """Test handling multiple sequences with mismatches""" spec_tokens = [[1, 2, 3], [4, 5, 6]] - output_tokens = [1, 2, 7, 6, 4, 8, 6, 9] # Mismatches in both sequences + output_tokens = [[1, 2, 7, 6], [4, 8, 6, + 9]] # Mismatches in both sequences - metadata = create_sampling_metadata(spec_tokens) + metadata = create_sampling_metadata(all_greedy=True) logits = create_logits_tensor(output_tokens) + bonus_token_tensor = torch.tensor( + [output_tokens[0][-1], output_tokens[1][-1]], device=logits.device) - output = sampler(spec_tokens, logits, metadata) + output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata) expected = torch.tensor([[1, 2, 7, INVALID_TOKEN_ID], [4, 8, INVALID_TOKEN_ID, INVALID_TOKEN_ID]], dtype=torch.int, device=logits.device) - assert torch.equal(output.sampled_token_ids, expected) + assert torch.equal(output, expected) @pytest.mark.parametrize( "spec_tokens,output_tokens,expected", [ - ([[1, 2]], [1, 2, 3], [[1, 2, 3]]), # Perfect match with bonus - ([[1]], [2, 3], [[2, INVALID_TOKEN_ID]]), # First mismatch - ([[1, 2], [3, 4]], [1, 5, 6, 3, 4, 7], [[1, 5, INVALID_TOKEN_ID], - [3, 4, 7]]), # Mixed matches + ([[1, 2]], [[1, 2, 3]], [[1, 2, 3]]), # Perfect match with bonus + ([[1]], [[2, 3]], [[2, INVALID_TOKEN_ID]]), # First mismatch + ([[1, 2], [3, 4]], [[1, 5, 6], [3, 4, 7]], + [[1, 5, INVALID_TOKEN_ID], [3, 4, 7]]), # Mixed matches ]) def test_parametrized_cases(sampler, spec_tokens, output_tokens, expected): """Parametrized test for various matching scenarios""" - metadata = create_sampling_metadata(spec_tokens) + metadata = create_sampling_metadata(all_greedy=True) logits = create_logits_tensor(output_tokens) + bonus_token_tensor = torch.tensor([tokens[-1] for tokens in output_tokens], + device=logits.device) - output = sampler(spec_tokens, logits, metadata) + output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata) expected_tensor = torch.tensor(expected, dtype=torch.int, device=logits.device) - assert torch.equal(output.sampled_token_ids, expected_tensor) + assert torch.equal(output, expected_tensor) -def test_logits_shape_handling(sampler): - """Test handling of different logits tensor shapes""" - spec_tokens = [[1, 2]] - 
output_tokens = [1, 2, 3] - vocab_size = 1000 +########################### Tests for Random Sampling ################### +@pytest.mark.parametrize("k", [1, 3, 5]) +@pytest.mark.parametrize("vocab_size", [1000]) +@pytest.mark.parametrize("batch_size", [1, 4, 8]) +@pytest.mark.parametrize("frac_seeded", [0.0, 0.5]) +@pytest.mark.parametrize("n_rep", [20]) +def test_deterministic_when_seeded(sampler, k: int, vocab_size: int, + batch_size: int, frac_seeded: float, + n_rep: int): + draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + target_probs = torch.rand(batch_size * (k + 1), + vocab_size, + dtype=torch.float32) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + draft_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64) - metadata = create_sampling_metadata(spec_tokens) - logits = create_logits_tensor(output_tokens, vocab_size) + seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded - output = sampler(spec_tokens, logits, metadata) - expected = torch.tensor([[1, 2, 3]], dtype=torch.int, device=logits.device) - assert torch.equal(output.sampled_token_ids, expected) - assert logits.shape[-1] == vocab_size + results = [] + for _ in range(n_rep): + seeded_seqs = { + i: torch.Generator(device=DEVICE).manual_seed(i) + for i in range(batch_size) if seeded_mask[i] + } + + sampling_metadata = create_sampling_metadata(all_greedy=False, + generators=seeded_seqs) + rep_result = sampler(draft_token_ids.tolist(), draft_probs, + bonus_token_ids, target_probs, sampling_metadata) + + results.append(rep_result) + + for i in range(batch_size): + if seeded_mask[i]: + for j in range(1, n_rep): + assert torch.equal(results[j][i], results[0][i]) + + +def test_rejection_sampling_approximates_target_distribution(): + """Verify rejection sampling approximates target distribution, + despite sampling from a potentially distinct draft distribution. + + This is done by first creating a random target probability + distribution and a random draft probability distribution. We then + sample token ids from the rejection sampler using these draft + and target distributions. The samples are used to estimate + the output probability distribution, which we expect to approximate + the target distribution. + + A basic distance metric is used to determine similarity between + distributions. + + We expect that as we increase the number of samples, + the distance between the observed distribution and the target + distribution decreases. To measure this, we compare the distance + of the observed distribution against both the target distribution + and a uniform random distribution. We expect the distance between + the observed distribution and the target distribution to improve + much more than the distance improvement between the observed + distribution and the random distribution. + """ + torch.set_default_device(DEVICE) + vocab_size = 10 + k = 2 + num_reference_probs = 100 + + # Prepare draft, target, and reference probability distributions + draft_probs, target_probs = (F.softmax( + torch.rand(vocab_size, dtype=torch.float32), + dim=-1, + ) for _ in range(2)) + reference_probs = F.softmax( + torch.rand(num_reference_probs, vocab_size, dtype=torch.float32), + dim=-1, + ) + + sample_sizes = [10, 100, 1_000, 10_000, 100_000] + distance_wrt_reference: list[float] = [] + distance_wrt_target: list[float] = [] + + for num_samples in sample_sizes: + # Sample using rejection sampling. 
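        # (Editor's note, an illustration rather than part of the diff.)
        # Speculative rejection sampling, per https://arxiv.org/abs/2211.17192,
        # keeps a draft token x with probability min(1, p_target(x) / p_draft(x))
        # and otherwise resamples from normalize(max(0, p_target - p_draft)),
        # so the output tokens are distributed exactly according to the target
        # distribution. That is why the histogram estimated below is expected
        # to converge to `target_probs` as `num_samples` grows, while staying
        # far from the random `reference_probs`.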
+ rej_sample_probs = estimate_rejection_sampling_pdf( + draft_probs, target_probs, k, vocab_size, num_samples) + rej_sample_probs = rej_sample_probs.to(DEVICE) + + # Average distance from reference probs. + reference_vs_rejsample_dist = torch.dist( + reference_probs, + rej_sample_probs).item() / reference_probs.shape[0] + target_vs_rejsample_dist = torch.dist(target_probs, + rej_sample_probs).item() + + distance_wrt_reference.append(reference_vs_rejsample_dist) + distance_wrt_target.append(target_vs_rejsample_dist) + + relative_change_in_distance_wrt_target = get_ratio_first_to_last( + distance_wrt_target) + relative_change_in_distance_wrt_reference = get_ratio_first_to_last( + distance_wrt_reference) + + print(f"{num_samples=} {target_vs_rejsample_dist=:.05f} " + f"{reference_vs_rejsample_dist=:.05f}") + print(f"{num_samples=} {relative_change_in_distance_wrt_target=:.02f} " + f"{relative_change_in_distance_wrt_reference=:.02f}") + + relative_change_in_distance_wrt_target = get_ratio_first_to_last( + distance_wrt_target) + relative_change_in_distance_wrt_reference = get_ratio_first_to_last( + distance_wrt_reference) + + expected_improvement_multiplier = 20 + assert (relative_change_in_distance_wrt_target + > relative_change_in_distance_wrt_reference * + expected_improvement_multiplier) + + +def get_ratio_first_to_last(elements: list[float]) -> float: + return elements[0] / elements[-1] + + +def estimate_rejection_sampling_pdf( + draft_probs: torch.Tensor, + target_probs: torch.Tensor, + k: int, + vocab_size: int, + num_samples: int, +) -> torch.Tensor: + """Estimate the probability distribution of the output tokens + using rejection sampling. + + Args: + draft_probs: Draft probability distribution. + target_probs: Target probability distribution. + num_samples: Number of samples to draw. + + Returns: + Estimated probability distribution of the output tokens. + """ + sampler = RejectionSampler() + # Repeat draft probs num_samples times. + draft_probs = draft_probs.reshape(1, 1, + vocab_size).repeat(num_samples, k, 1) + + # Repeat target probs num_samples * (k + 1) times. + target_probs = target_probs.reshape(1, 1, vocab_size).repeat( + num_samples, k + 1, 1).reshape(num_samples * (k + 1), vocab_size) + + # Randomly sample draft token ids from draft probs. + draft_token_ids = torch.multinomial(draft_probs[:, 0, :], + num_samples=k, + replacement=True).reshape( + num_samples, k) + + # Bonus tokens not used but required. 
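    # (Editor's note, not part of the diff.) The sampler interface always
    # expects one bonus slot per request, so dummy zeros are passed here and
    # the bonus column is dropped again below via `output_token_ids[:, :-1]`.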
+ bonus_token_ids = torch.zeros((1, 1), dtype=torch.int64, + device=DEVICE).repeat(num_samples, 1) + + sampling_metadata = create_sampling_metadata(all_greedy=False) + output_token_ids = sampler(draft_token_ids.tolist(), draft_probs, + bonus_token_ids, target_probs, + sampling_metadata) + output_token_ids = output_token_ids[:, :-1].flatten() + + hist = torch.histogram(output_token_ids.to(dtype=torch.float, + device="cpu"), + bins=vocab_size, + range=(0, vocab_size), + density=True) + + return hist.hist diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index ea7f3353c115f..5601c62e91fc0 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -1,87 +1,89 @@ # SPDX-License-Identifier: Apache-2.0 +from typing import Optional import torch import torch.nn as nn from torch.nn.utils.rnn import pad_sequence -from vllm import envs from vllm.logger import init_logger -from vllm.platforms import current_platform -from vllm.v1.outputs import SamplerOutput from vllm.v1.sample.metadata import SamplingMetadata - -try: - import flashinfer.sampling as fs - is_flashinfer_available = True -except ImportError: - is_flashinfer_available = False +from vllm.v1.spec_decode.utils import random_sample logger = init_logger(__name__) INVALID_TOKEN_ID = -1 class RejectionSampler(nn.Module): + """ + The implementation strictly follows the algorithm described in + https://arxiv.org/abs/2211.17192. + However, we want to clarify the terminology used in the implementation: + accepted tokens: tokens that are accepted based on the relationship + between the "raw" draft and target probabilities. + recovered tokens: tokens that are sampled based on the adjusted probability + distribution, which is derived from both the draft and target + probabilities. + bonus tokens: + If all proposed tokens are accepted, the bonus token is added to the + end of the sequence. The bonus token is only sampled from the target + probabilities. We pass in the bonus tokens instead of sampling them + in the rejection sampler to allow for more flexibility in the + sampling process. For example, we can use top_p, top_k sampling for + bonus tokens, while spec decode does not support these sampling + strategies. + output tokens: + Tokens are finally generated with the rejection sampler. + output tokens = accepted tokens + recovered tokens + bonus tokens + """ def __init__(self): super().__init__() - if current_platform.is_cuda(): - if is_flashinfer_available: - if envs.VLLM_USE_FLASHINFER_SAMPLER is not False: - # FIXME(woosuk): Currently, we have errors when using - # FlashInfer for rejection sampling. As a workaround, we - # disable FlashInfer for rejection sampling by default. - logger.info("Currently, FlashInfer rejection sampler is " - "disabled because of a bug. Falling back to " - "the PyTorch-native implementation of " - "rejection sampling.") - self.forward_method = self.forward_native - # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for - # sampling unless VLLM_USE_FLASHINFER_SAMPLER=1 (i.e., by - # default it is unused). For backward compatibility, we set - # `VLLM_USE_FLASHINFER_SAMPLER` as None by default and - # interpret it differently in V0 and V1 samplers: In V0, - # None means False, while in V1, None means True. This is - # why we use the condition - # `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here. 
- # logger.info("Using FlashInfer for rejection sampling.") - # self.forward_method = self.flashinfer_sample - else: - logger.warning( - "FlashInfer is available, but it is not enabled. " - "Falling back to the PyTorch-native implementation of " - "rejection sampling. For the best performance, " - "please set VLLM_USE_FLASHINFER_SAMPLER=1.") - self.forward_method = self.forward_native - else: - logger.warning( - "FlashInfer is not available. Falling back to the PyTorch-" - "native implementation of rejection sampling. For the " - "best performance, please install FlashInfer.") - self.forward_method = self.forward_native - else: - self.forward_method = self.forward_native - - def forward(self, draft_token_ids: list[list[int]], - target_probs: torch.Tensor, - sampling_metadata: SamplingMetadata) -> SamplerOutput: - if not sampling_metadata.all_greedy: - raise NotImplementedError( - "Currently, only greedy sampling is supported by " - "rejection sampler.") - return self.forward_method(draft_token_ids, target_probs, - sampling_metadata) - - def flashinfer_sample( + def forward( self, draft_token_ids: list[list[int]], - target_probs: torch.Tensor, + draft_probs: Optional[torch.Tensor], + bonus_token_ids_tensor: torch.Tensor, # [batch_size, 1] + target_probs: torch.Tensor, # [num_total_tokens, vocab_size] sampling_metadata: SamplingMetadata, - ) -> SamplerOutput: + ) -> torch.Tensor: + ''' + Args: + draft_token_ids (List[List[int]]): + A 2D list of token IDs for each request in the batch. + Each request might have different number of draft tokens. + It may also contain empty lists for requests that have + no draft tokens. + draft_probs (Optional[torch.Tensor]): + Probability distribution for the draft tokens. Shape is + [batch_size, max_spec_len, vocab_size]. Can be None if + probabilities are not provided, which is the case for + ngram spec decode. + bonus_token_ids_tensor (torch.Tensor): + A tensor containing bonus tokens. Shape is [batch_size, 1]. + Bonus tokens are added to the end of the sequence if all + proposed tokens are accepted. We generate the bonus tokens + outside of the rejection sampler with the default sampling + strategy. It allows for more flexibility in the sampling + process such as top_p, top_k sampling. + target_probs (torch.Tensor): + Target model probability distribution. + Shape is [num_total_tokens, vocab_size]. num_total_tokens + is the total number of tokens from all requests. Here, + probabilities from different requests are flattened into + a single tensor because this is the shape of the output + logits. + sampling_metadata (SamplingMetadata): + Additional metadata needed for sampling, such as temperature, + top-k/top-p parameters, or other relevant information. + Returns: + output_token_ids (torch.Tensor): + A tensor containing the final output token IDs. + ''' + # NOTE: The following input preparationg can be moved # to the model runner with a persistent manner for better # performance. - sample_lens = [len(x) + 1 for x in draft_token_ids] # Convert draft token IDs to a tensor, split by sample_lens, then pad. 
draft_token_ids = [ torch.tensor(x, dtype=int, device='cpu') for x in draft_token_ids @@ -90,90 +92,171 @@ class RejectionSampler(nn.Module): batch_first=True, padding_value=INVALID_TOKEN_ID) - if sampling_metadata.all_greedy: - target_token_ids = target_probs.argmax(dim=-1).view(-1) - target_token_ids = target_token_ids.split(sample_lens) - target_token_ids = pad_sequence(target_token_ids, - batch_first=True, - padding_value=INVALID_TOKEN_ID) + # NOTE: CPU <-> GPU synchronization happens here. + draft_token_ids_tensor = draft_token_ids_tensor.to(target_probs.device) + # Create one-hot tensor for draft token ids. + # This is used for ngram where we don't have draft_probs. + if draft_probs is None and not sampling_metadata.all_greedy: vocab_size = target_probs.size(-1) - # NOTE: CPU <-> GPU synchronization happens here. - draft_token_ids_tensor = draft_token_ids_tensor.to( - target_probs.device) draft_probs = _create_greedy_token_probs(draft_token_ids_tensor, vocab_size, target_probs.device) - target_probs = _create_greedy_token_probs(target_token_ids, - vocab_size, - target_probs.device) - uniform_samples = torch.zeros(draft_token_ids_tensor.size(0), - draft_token_ids_tensor.size(1) + 1, - device=target_probs.device) - else: - raise NotImplementedError( - "Currently, only greedy sampling is supported by " - "rejection sampler.") + sample_lens = [len(x) + 1 for x in draft_token_ids] + target_probs = _convert_2d_probs(target_probs, sample_lens) - sampled_token_ids, _, _ = fs.chain_speculative_sampling( - draft_probs, - draft_token_ids_tensor, - uniform_samples, - target_probs, - ) - return SamplerOutput(sampled_token_ids=sampled_token_ids, - logprobs_tensors=None) + return self.forward_native(draft_token_ids_tensor, draft_probs, + bonus_token_ids_tensor, target_probs, + sampling_metadata) # TODO: The following method can be optimized for better performance. def forward_native( self, - draft_token_ids: list[list[int]], + draft_token_ids_tensor: torch.Tensor, + # [batch_size, max_spec_len, vocab_size] + draft_probs: Optional[torch.Tensor], + bonus_token_ids_tensor: torch.Tensor, + # [batch_size, max_spec_len + 1, vocab_size] target_probs: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> SamplerOutput: - sample_lens = [len(x) + 1 for x in draft_token_ids] - # Convert draft token IDs to a tensor, split by sample_lens, then pad. - draft_token_ids = [ - torch.tensor(x, dtype=int, device='cpu') for x in draft_token_ids - ] - draft_token_ids_tensor = pad_sequence(draft_token_ids, - batch_first=True, - padding_value=INVALID_TOKEN_ID) - draft_token_ids_tensor = draft_token_ids_tensor.to(target_probs.device) + ) -> torch.Tensor: # Add 1 to include the 'bonus' token. if sampling_metadata.all_greedy: - output_token_ids = target_probs.argmax(dim=-1).view(-1) - output_token_ids = output_token_ids.split(sample_lens) - output_token_ids = pad_sequence(output_token_ids, - batch_first=True, - padding_value=INVALID_TOKEN_ID) # Produce a mask that remains 1 (True) until the first # mismatch (cumprod turns 0 after a mismatch). - accept_mask = ( - output_token_ids[:, :-1] == draft_token_ids_tensor).cumprod( - dim=1) - else: - raise NotImplementedError( - "Currently, only greedy sampling is supported by " - "rejection sampler.") - # Identify valid positions (non-padding). - valid_mask = output_token_ids != INVALID_TOKEN_ID - # Generate mask with bonus token. 
- generate_mask = torch.cat([ - accept_mask, - torch.zeros(accept_mask.size(0), 1, device=accept_mask.device) - ], - dim=1).to(torch.bool) & valid_mask - zeros_mask = (generate_mask == 0) - first_zero_idx = zeros_mask.float().argmax(dim=1) - # Figure out which rows actually contain at least one zero. - rows_with_zero = zeros_mask.any(dim=1) - # Use indexing to set the first zero in each of those rows to 1. - generate_mask[rows_with_zero, first_zero_idx[rows_with_zero]] = 1 + target_token_ids_tensor = target_probs.argmax(dim=-1) + accept_mask = (target_token_ids_tensor[:, :-1] == + draft_token_ids_tensor).cumprod(dim=1) - output_token_ids[~generate_mask] = INVALID_TOKEN_ID - return SamplerOutput(sampled_token_ids=output_token_ids, - logprobs_tensors=None) + # Identify valid positions (non-padding). + valid_mask = target_token_ids_tensor != INVALID_TOKEN_ID + # Generate mask with bonus token. + generate_mask = torch.cat([ + accept_mask, + torch.zeros(accept_mask.size(0), 1, device=accept_mask.device) + ], + dim=1).to(torch.bool) & valid_mask + zeros_mask = (generate_mask == 0) + first_zero_idx = zeros_mask.float().argmax(dim=1) + # Figure out which rows actually contain at least one zero. + rows_with_zero = zeros_mask.any(dim=1) + # Use indexing to set the first zero in each of those rows to 1. + generate_mask[rows_with_zero, first_zero_idx[rows_with_zero]] = 1 + + output_token_ids = target_token_ids_tensor + output_token_ids[~generate_mask] = INVALID_TOKEN_ID + else: + # Reference: https://arxiv.org/pdf/2211.17192 + # 1. Extract the probabilities of the draft tokens. + # [batch_size, max_spec_len] + batch_size = draft_token_ids_tensor.size(0) + max_spec_len = draft_token_ids_tensor.size(1) + invalid_idx = draft_token_ids_tensor == INVALID_TOKEN_ID + draft_token_ids_tensor[invalid_idx] = 0 + assert draft_probs is not None + draft_token_probs = draft_probs.gather( + dim=-1, index=draft_token_ids_tensor.unsqueeze(-1)).squeeze(-1) + target_token_probs = target_probs.gather( + dim=-1, index=draft_token_ids_tensor.unsqueeze(-1)).squeeze(-1) + # Force the probabilities of invalid tokens to inf + # so that they are not accepted. + draft_token_probs[invalid_idx] = float('inf') + + # 2. Generate uniform samples. + # [batch_size, max_spec_len + 1] + uniform_samples = _create_uniform_samples( + sampling_metadata.generators, batch_size, max_spec_len, + target_probs.device) + + # 3. Accept or reject the samples. + # [batch_size, max_spec_len] + # If the draft token probabilities are 0, set them to the smallest + # positive normal value representable by float32. + safe_draft_probs = torch.where(draft_token_probs > 0, + draft_token_probs, + torch.finfo(torch.float32).tiny) + accepted = uniform_samples <= target_token_probs / safe_draft_probs + accept_mask = accepted.cumprod(dim=1) + # Set the token ids to the draft token ids if accepted, otherwise + # set them to INVALID_TOKEN_ID. + accepted_token_ids = (draft_token_ids_tensor * accept_mask + + INVALID_TOKEN_ID * (1 - accept_mask)) + + # 4. Adjust the distribution for the recovered tokens. + # Clamp the bonus probabilities to the smallest positive normal + # value representable by float32. + bonus_prob = torch.clamp(target_probs[:, :-1, :] - draft_probs, + min=torch.finfo(torch.float32).tiny) + normalized_bonus_prob = bonus_prob / bonus_prob.sum(dim=-1, + keepdim=True) + + # 5. Sample recovered token ids. + recovered_token_ids = random_sample( + normalized_bonus_prob, + sampling_metadata.generators).reshape(batch_size, max_spec_len) + + # 6. 
Get the final output token ids. + # output_token_ids = accepted_token_ids + + # recovered_token_ids + + # bonus_token_id + recovered_bonus_token_ids = torch.cat( + [recovered_token_ids, bonus_token_ids_tensor], dim=1) + # Generate mask with bonus tokens. + generate_mask = torch.cat([ + accept_mask, + torch.zeros(batch_size, 1, device=accept_mask.device) + ], + dim=1).to(torch.bool) + zeros_mask = (generate_mask == 0) + first_zero_idx = zeros_mask.float().argmax(dim=1) + output_token_ids = torch.cat([ + accepted_token_ids, + torch.full((batch_size, 1), + fill_value=INVALID_TOKEN_ID, + device=accept_mask.device) + ], + dim=1) + output_token_ids[torch.arange(batch_size), + first_zero_idx] = recovered_bonus_token_ids[ + torch.arange(batch_size), first_zero_idx] + + return output_token_ids + + def compute_probs(self, logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + sample_lens: list[int]) -> torch.Tensor: + """ + Compute probability distribution from logits based on sampling metadata. + + This function applies temperature scaling to the logits and converts + them to probabilities using softmax. Note that division by + temperature is not performed inplace to preserve the original logits + tensor, which will be used by the original sampler to get bonus tokens. + + Args: + logits: Input logits tensor to be converted to probabilities + sampling_metadata: Metadata containing sampling parameters such + as temperature and whether greedy sampling is used + sample_lens: List of sample lengths used for repeating + temperature values + + Returns: + torch.Tensor: Probability distribution (softmax of scaled logits) + if non-greedy sampling is used, otherwise returns the + original logits + """ + if sampling_metadata.all_greedy: + return logits + assert sampling_metadata.temperature is not None + # We should optimize the following code as + # it will cause CPU -> GPU synchronization. + temperature = torch.repeat_interleave( + sampling_metadata.temperature, + torch.tensor(sample_lens, + device=sampling_metadata.temperature.device)) + temperature = temperature.unsqueeze(dim=1) + logits = logits / temperature + return logits.softmax(dim=-1, dtype=torch.float32) def _create_greedy_token_probs( @@ -199,3 +282,66 @@ def _create_greedy_token_probs( src=valid_mask.unsqueeze(-1).float()) return token_probs + + +def _convert_2d_probs( + probs: torch.Tensor, # [num_total_tokens, vocab_size] + sample_lens: list[int]) -> torch.Tensor: + """ + Converts a 2D tensor of probabilities to a 3D tensor with padding. + [num_total_tokens, vocab_size] -> + [batch_size, max_spec_len + 1, vocab_size] + """ + cumulative_lens = torch.cumsum(torch.tensor(sample_lens, + device=probs.device), + dim=0) + split_indices = cumulative_lens[:-1].tolist() # Exclude last index + + # Split into chunks without loops + chunks = torch.tensor_split(probs, split_indices, dim=0) + + # Pad all sequences to maximum length + padded_probs = pad_sequence(chunks, batch_first=True, padding_value=0.0) + return padded_probs + + +def _create_uniform_samples(seeded_seqs: dict[int, torch.Generator], + batch_size: int, k: int, + device: torch.device) -> torch.Tensor: + """ + Generates a batch of uniform random samples, with optional seeding + for specific sequences. + + This method creates a tensor of shape `(batch_size, k)` filled + with uniform random values in the range [0, 1). If `seeded_seqs` + is provided, the sequences corresponding to specific indices + will be generated using the provided `torch.Generator` for + reproducibility. 
The other sequences will be generated without + a seed. + + Args: + seeded_seqs : Optional[Dict[int, torch.Generator]] + A dictionary mapping indices in the batch to + `torch.Generator` objects. + batch_size : int + The number of sequences to generate. + k : int + The number of random samples per sequence. + device : torch.device + The device on which to allocate the tensor. + + Returns: + uniform_rand : torch.Tensor + A tensor of shape `(batch_size, k)` containing uniform + random values in the range [0, 1). + """ + + uniform_rand = torch.rand(batch_size, + k, + dtype=torch.float32, + device=device) + # Apply seeded generators only where needed + if seeded_seqs: + for idx, generator in seeded_seqs.items(): + uniform_rand[idx].uniform_(0, 1, generator=generator) + return uniform_rand diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 96f6d807b10ce..d91c057083f31 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -119,14 +119,6 @@ class Sampler(nn.Module): ) return sampled - def compute_probs(self, logits: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - if sampling_metadata.all_greedy: - return logits - # Apply temperature. This is an in-place op changing logits. - logits = self.apply_temperature(logits, sampling_metadata.temperature) - return logits.softmax(dim=-1, dtype=torch.float32) - def compute_logprobs(self, logits: torch.Tensor) -> torch.Tensor: return logits.log_softmax(dim=-1, dtype=torch.float32) diff --git a/vllm/v1/spec_decode/utils.py b/vllm/v1/spec_decode/utils.py new file mode 100644 index 0000000000000..5841401367788 --- /dev/null +++ b/vllm/v1/spec_decode/utils.py @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: Apache-2.0 +from vllm.v1.sample.ops.topk_topp_sampler import random_sample # noqa +from vllm.v1.worker.gpu_input_batch import InputBatch + + +def is_spec_decode_supported(req_id: str, input_batch: InputBatch) -> bool: + if req_id in input_batch.top_k_reqs or req_id in input_batch.top_p_reqs: + # Spec decode doesn't support top_p/top_k sampling. + return False + elif req_id in input_batch.min_p_reqs: + # Spec decode doesn't support min_p sampling. + return False + elif (req_id in input_batch.frequency_penalties_reqs + or req_id in input_batch.presence_penalties_reqs + or req_id in input_batch.repetition_penalties_reqs): + # Spec decode doesn't support penalties. + return False + elif req_id in input_batch.num_logprobs: + # Spec decode doesn't support logprobs. 
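        # (Editor's note, an assumption rather than part of the diff: the
        # rejection sampler in this series returns only token ids, so requests
        # that ask for logprobs presumably cannot be served speculatively yet.)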
+ return False + + return True diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4059d5b17b71b..2a98bea562dcb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -37,6 +37,7 @@ from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import INVALID_TOKEN_ID, RejectionSampler from vllm.v1.spec_decode.ngram_proposer import NgramProposer +from vllm.v1.spec_decode.utils import is_spec_decode_supported from vllm.v1.utils import bind_kv_cache from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin @@ -1020,15 +1021,26 @@ class GPUModelRunner(LoRAModelRunnerMixin): sampling_metadata=sampling_metadata, ) else: - target_probs = self.model.sampler.compute_probs( - logits, sampling_metadata) draft_token_ids = [ scheduler_output.scheduled_spec_decode_tokens.get(req_id, []) for req_id in self.input_batch.req_ids ] - sampler_output = self.rejection_sampler(draft_token_ids, - target_probs, - sampling_metadata) + sample_lens = [len(tokens) + 1 for tokens in draft_token_ids] + recover_logits_idx = np.cumsum(sample_lens) - 1 + target_probs = self.rejection_sampler.compute_probs( + logits, sampling_metadata, sample_lens) + sampler_output = self.model.sample( + logits=logits[recover_logits_idx, :], + sampling_metadata=sampling_metadata, + ) + bonus_token_ids = sampler_output.sampled_token_ids + output_token_ids = self.rejection_sampler( + draft_token_ids, + None, # draft_probs + bonus_token_ids, + target_probs, + sampling_metadata) + sampler_output.sampled_token_ids = output_token_ids # TODO(woosuk): The following loop can be slow since it iterates over # the requests one by one. Optimize. @@ -1075,7 +1087,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): spec_token_ids = None else: spec_token_ids = self.generate_draft_token_ids( - valid_sampled_token_ids) + valid_sampled_token_ids, sampling_metadata) return ModelRunnerOutput( req_ids=self.input_batch.req_ids, @@ -1089,6 +1101,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): def generate_draft_token_ids( self, sampled_token_ids: list[list[int]], + sampling_metadata: SamplingMetadata, ) -> list[list[int]]: # TODO(woosuk): Optimize. draft_token_ids: list[list[int]] = [] @@ -1099,6 +1112,12 @@ class GPUModelRunner(LoRAModelRunnerMixin): draft_token_ids.append([]) continue + # Skip requests that require top-p, top-k, etc. + req_id = self.input_batch.req_ids[i] + if not is_spec_decode_supported(req_id, self.input_batch): + draft_token_ids.append([]) + continue + # Add sampled_token_ids to token_ids_cpu. 
start_idx = self.input_batch.num_tokens_no_spec[i] end_idx = start_idx + num_sampled_ids From b539222d4e81512e0cfa6cf56927a70c3aaca9d2 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 17 Mar 2025 14:42:06 +0800 Subject: [PATCH 23/34] [V1] Remove input cache client (#14864) Signed-off-by: DarkLight1337 Signed-off-by: Roger Wang Co-authored-by: Roger Wang --- vllm/inputs/preprocess.py | 6 ++ vllm/v1/engine/__init__.py | 2 +- vllm/v1/engine/mm_input_cache.py | 122 +++-------------------------- vllm/v1/engine/processor.py | 80 ++++++------------- vllm/v1/worker/gpu_model_runner.py | 39 ++------- 5 files changed, 48 insertions(+), 201 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index f56cff292b68b..af35e43d825a2 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -379,6 +379,7 @@ class InputPreprocessor: multi_modal_data, mm_processor_kwargs, lora_request=lora_request, + return_mm_hashes=return_mm_hashes, ) prompt_token_ids = self._tokenize_prompt( @@ -401,6 +402,7 @@ class InputPreprocessor: prompt: SingletonPrompt, request_id: str, lora_request: Optional[LoRARequest] = None, + return_mm_hashes: bool = False, ) -> SingletonInputs: """Async version of :meth:`_extract_prompt_components`.""" parsed = parse_singleton_prompt(prompt) @@ -431,6 +433,7 @@ class InputPreprocessor: multi_modal_data, mm_processor_kwargs, lora_request=lora_request, + return_mm_hashes=return_mm_hashes, ) return token_inputs( @@ -452,6 +455,7 @@ class InputPreprocessor: multi_modal_data, mm_processor_kwargs, lora_request=lora_request, + return_mm_hashes=return_mm_hashes, ) prompt_token_ids = await self._tokenize_prompt_async( @@ -726,6 +730,7 @@ class InputPreprocessor: prompt, request_id=request_id, lora_request=lora_request, + return_mm_hashes=return_mm_hashes, ) return self._build_decoder_only_llm_inputs( @@ -746,6 +751,7 @@ class InputPreprocessor: prompt, request_id=request_id, lora_request=lora_request, + return_mm_hashes=return_mm_hashes, ) return self._build_decoder_only_llm_inputs( diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index cd29c2d7d57c0..3699779b3a0fe 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -52,7 +52,7 @@ class EngineCoreRequest( # Detokenizer, but set to None when it is added to EngineCoreClient. prompt: Optional[str] prompt_token_ids: list[int] - mm_inputs: Optional[list[Optional[MultiModalKwargs]]] + mm_inputs: Optional[list[MultiModalKwargs]] mm_hashes: Optional[list[str]] mm_placeholders: Optional[list[PlaceholderRange]] sampling_params: SamplingParams diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index e2dda73ba4299..61a55d2499bd1 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -1,131 +1,30 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Optional - -from vllm.config import ModelConfig from vllm.envs import VLLM_MM_INPUT_CACHE_GIB -from vllm.logger import init_logger -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, - MultiModalKwargs, MultiModalRegistry) +from vllm.multimodal import MultiModalKwargs from vllm.multimodal.processing import ProcessingCache -logger = init_logger(__name__) - # The idea of multimodal preprocessing caching is based on having a client and # a server, where the client executes in the frontend process (=P0) and the # server in the core process (=P1). # # -- Client: -# - Apply legacy input_mapper (if one exists) to generate MultiModalKwargs. 
-# - Perform caching of the generated MultiModalKwargs. -# - This client can be deprecated once all mutimodal models migrate to use -# merged preprocessor with built-in caching functionality. +# - BaseMultiModalProcessor to process MultiModalData into MultiModalKwargs +# with built-in caching functionality, with mm_hash as its identifier. # # -- Server: -# - Perform caching of the received MultiModalKwargs. +# - MMInputCacheServer to perform caching of the received MultiModalKwargs. # -# The caching for both client and server is mirrored/similar, and this allows us +# The caching for both client and server is mirrored, and this allows us # to avoid the serialization of "mm_inputs" (like pixel values) between -# client (=P0) and server (=P1) processes. +# client (=P0) and server (=P1) processes if the mm_hash is found in the client +# cache. # Both Client and Server must use the same cache size # (to perform mirrored caching). This cache size is set by the environment # variable VLLM_MM_INPUT_CACHE_GIB. -# TODO(ywang96): Deprecate this class once all multimodal models migrate to use -# merged preprocessor with built-in caching functionality. -class MMInputCacheClient: - - def __init__( - self, - model_config: ModelConfig, - mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, - ): - self.model_config = model_config - self.mm_registry = mm_registry - self.multi_modal_input_mapper = mm_registry.create_input_mapper( - model_config) - self.mm_registry.init_mm_limits_per_prompt(model_config) - - # Init cache - self.use_cache = not model_config.disable_mm_preprocessor_cache - self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB, - MultiModalKwargs) - - # DEBUG: Set to None to disable - self.mm_debug_cache_hit_ratio_steps = None - self.mm_debug_cache_hits = 0 - self.mm_debug_cache_total = 0 - - def cache_hit_ratio(self, steps): - total = self.mm_debug_cache_total - - if total > 0 and total % steps == 0: - logger.debug("MMInputMapper: cache_hit_ratio = %.2f ", - self.mm_debug_cache_hits / total) - - # NOTE: process_inputs only supports image inputs since all multimodal - # models with other modalities have migrated to use merged preprocessor. - def process_inputs( - self, - mm_data: MultiModalDataDict, - mm_hashes: Optional[list[str]], - mm_processor_kwargs: Optional[dict[str, Any]], - precomputed_mm_inputs: Optional[list[MultiModalKwargs]], - ) -> list[Optional[MultiModalKwargs]]: - if precomputed_mm_inputs is None: - image_inputs = mm_data["image"] - if not isinstance(image_inputs, list): - image_inputs = [image_inputs] - num_inputs = len(image_inputs) - else: - num_inputs = len(precomputed_mm_inputs) - - # Sanity - if self.use_cache: - assert mm_hashes is not None - assert num_inputs == len(mm_hashes) - - # Process each image input separately, so that later we can schedule - # them in a fine-grained manner. 
- # Apply caching (if enabled) and reuse precomputed inputs (if provided) - ret_inputs: list[Optional[MultiModalKwargs]] = [] - for input_id in range(num_inputs): - if self.mm_debug_cache_hit_ratio_steps is not None: - self.cache_hit_ratio(self.mm_debug_cache_hit_ratio_steps) - - mm_input = None - if self.use_cache: - assert mm_hashes is not None - mm_hash = mm_hashes[input_id] - mm_input = self.mm_cache.get(mm_hash) - - self.mm_debug_cache_total += 1 - if mm_input is None: - if precomputed_mm_inputs is not None: - # Reuse precomputed input (for merged preprocessor) - mm_input = precomputed_mm_inputs[input_id] - else: - # Apply legacy input_mapper - mm_input = self.multi_modal_input_mapper( - {"image": [image_inputs[input_id]]}, - mm_processor_kwargs=mm_processor_kwargs, - ) - - if self.use_cache: - # Add to cache - assert mm_hash is not None - self.mm_cache[mm_hash] = mm_input - else: - self.mm_debug_cache_hits += 1 - mm_input = None # Avoids sending mm_input to Server - - ret_inputs.append(mm_input) - - return ret_inputs - - class MMInputCacheServer: def __init__(self, model_config): @@ -135,9 +34,9 @@ class MMInputCacheServer: def get_and_update( self, - mm_inputs: list[Optional[MultiModalKwargs]], + mm_inputs: list[MultiModalKwargs], mm_hashes: list[str], - ) -> list[Optional[MultiModalKwargs]]: + ) -> list[MultiModalKwargs]: assert len(mm_inputs) == len(mm_hashes) if not self.use_cache: @@ -147,8 +46,7 @@ class MMInputCacheServer: for mm_input, mm_hash in zip(mm_inputs, mm_hashes): assert mm_hash is not None if mm_input is None: - mm_input = self.mm_cache.get(mm_hash) - assert mm_input is not None + mm_input = self.mm_cache[mm_hash] else: self.mm_cache[mm_hash] = mm_input diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 663e1e36f7561..4e9e5506bb587 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -11,15 +11,15 @@ from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, from vllm.inputs.parse import is_encoder_decoder_inputs from vllm.inputs.preprocess import InputPreprocessor from vllm.lora.request import LoRARequest -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalHasher, - MultiModalKwargs, MultiModalRegistry) +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, + MultiModalRegistry) +from vllm.multimodal.inputs import PlaceholderRange from vllm.multimodal.utils import merge_and_sort_multimodal_metadata from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.mm_input_cache import MMInputCacheClient from vllm.v1.structured_output.utils import validate_structured_output_request @@ -45,11 +45,6 @@ class Processor: self.input_preprocessor = InputPreprocessor(self.model_config, self.tokenizer, mm_registry) - self.input_processor = input_registry.create_input_processor( - self.model_config) - - # Multi-modal (huggingface) input mapper - self.mm_input_cache_client = MMInputCacheClient(self.model_config) # Multi-modal hasher (for images) self.use_hash = ( @@ -171,7 +166,7 @@ class Processor: # 2. For multimodal models with a merged preprocessor, preprocess # multimodal data and expand prompt token ids accordingly. # 3. Apply prompt adapter to prompt token ids if one exists. 
- preprocessed_inputs = self.input_preprocessor.preprocess( + processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess( prompt, request_id=request_id, lora_request=lora_request, @@ -180,10 +175,6 @@ class Processor: ) eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) - # Process prompt and prompt token ids. - # Only applicable to multimodal models with legacy input processor. - processed_inputs = self.input_processor(preprocessed_inputs) - self._validate_model_inputs(processed_inputs, lora_request) if is_encoder_decoder_inputs(processed_inputs): @@ -212,36 +203,22 @@ class Processor: self.tokenizer.get_lora_tokenizer(lora_request)) # Multimodal related. - # Compute MM hashes (if enabled) - mm_hashes = None - if self.use_hash: - # Use mm_hashes from processed inputs if the model has merged - # input processor. - if decoder_inputs.multi_modal_hashes: - mm_hashes = decoder_inputs.multi_modal_hashes - # Fallback to using MultiModalHasher directly. - else: - mm_hashes = MultiModalHasher.hash_prompt_mm_data(prompt) + sorted_mm_inputs: Optional[list[MultiModalKwargs]] = None + sorted_mm_positions: Optional[list[PlaceholderRange]] = None + sorted_mm_hashes: Optional[list[str]] = None + if (decoder_mm_inputs := decoder_inputs.multi_modal_data): + assert isinstance(decoder_mm_inputs, MultiModalKwargs) - # For merged preprocessor, mm_data is already mm_inputs - precomputed_mm_inputs: Optional[list[MultiModalKwargs]] = None - decoder_mm_data = decoder_inputs.multi_modal_data - if isinstance(decoder_mm_data, MultiModalKwargs): - # The output of merged multi-modal processor (`decoder_mm_data`) + # The output of merged multi-modal processor (`decoder_mm_inputs`) # contains the kwargs for all items from all modalities. # This code separates them so that there is one set of kwargs # per item per modality. - precomputed_mm_inputs = [ + individual_mm_inputs = [ MultiModalKwargs.from_items([item]) - for modality in decoder_mm_data.modalities - for item in decoder_mm_data.get_items(modality) + for modality in decoder_mm_inputs.modalities + for item in decoder_mm_inputs.get_items(modality) ] - mm_positions = decoder_inputs.multi_modal_placeholders - - # Last-mile processing of multimodal metadata and inputs. - if mm_positions: - # Merge and flatten multimodal placeholders, hashes and inputs # from dictionaries to lists, and sort them by each item's position # in the input sequence. @@ -251,14 +228,13 @@ class Processor: sorted_mm_positions, sorted_mm_hashes, ) = merge_and_sort_multimodal_metadata( - mm_positions, - mm_hashes, + decoder_inputs.multi_modal_placeholders, + decoder_inputs.multi_modal_hashes if self.use_hash else None, ) # NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple - # modalities involved AND the model supports merged input processor. - if len(sorted_modalities) > 1 and precomputed_mm_inputs: - + # modalities involved. + if len(sorted_modalities) > 1: modality_order_dict = { modality: order for order, modality in enumerate(sorted_modalities) @@ -266,26 +242,16 @@ class Processor: # Sanity check to make sure each multimodal input has only one # modality key. 
- for mm_input in precomputed_mm_inputs: + for mm_input in individual_mm_inputs: assert len(mm_input.modalities) == 1 - # Sort MultiModalKwags to match sorted_mm_positions - precomputed_mm_inputs = sorted( - precomputed_mm_inputs, + # Sort MultiModalKwargs to match sorted_mm_positions + sorted_mm_inputs = sorted( + individual_mm_inputs, key=lambda mm_input: modality_order_dict[list( mm_input.modalities)[0]]) - - # Apply mm input cache update and legacy input mapper if one exists. - sorted_mm_inputs = self.mm_input_cache_client.process_inputs( - mm_data=decoder_mm_data, - mm_hashes=sorted_mm_hashes, - mm_processor_kwargs=decoder_inputs.mm_processor_kwargs, - precomputed_mm_inputs=precomputed_mm_inputs, - ) - else: - sorted_mm_inputs = None - sorted_mm_hashes = None - sorted_mm_positions = None + else: + sorted_mm_inputs = individual_mm_inputs return EngineCoreRequest( request_id=request_id, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2a98bea562dcb..66015382bfe85 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -29,7 +29,6 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, is_pin_memory_available) from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.core.encoder_cache_manager import compute_encoder_budget -from vllm.v1.engine.mm_input_cache import MMInputCacheClient from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheSpec) from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, @@ -133,14 +132,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope - if self.is_multimodal_model: - # NOTE: Initialized client is only used for processing dummy - # multimodal data into multimodal kwargs for GPU memory profiling. - # Only applicable to multimodal models with legacy input mapper. - self.mm_input_mapper_profiling = MMInputCacheClient( - self.model_config) - self.mm_input_mapper_profiling.use_cache = False - encoder_compute_budget, encoder_cache_size = compute_encoder_budget( model_config=model_config, scheduler_config=scheduler_config, @@ -1376,32 +1367,18 @@ class GPUModelRunner(LoRAModelRunnerMixin): mm_registry=self.mm_registry, ) dummy_mm_data = dummy_request_data.multi_modal_data + if not isinstance(dummy_mm_data, MultiModalKwargs): + # TODO: Delete this check once input mapper is fully removed. + raise RuntimeError( + "Legacy input mapper is not supported in V1") - # Dummy data definition in V0 may contain multiple multimodal items + # Dummy data definition may contain multiple multimodal items # (e.g, multiple images) for a single request, therefore here we # always replicate first item by max_num_mm_items times since in V1 # they are scheduled to be processed separately. - - # Case when models have a merged processor, their dummy data is - # already batched `MultiModalKwargs`, therefore we take the first - # `MultiModalKwargsItem` from the desired modality to profile on. - if isinstance(dummy_mm_data, MultiModalKwargs): - dummy_mm_item = dummy_mm_data.get_item( - modality=dummy_data_modality, item_index=0) - dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item]) - - # Case when models have dummy data explicitly defined as - # `MultiModalDataDict`, so they need to be processed through input - # mapper. - # TODO (ywang96): deprecate this path once merged processor is - # supported on all models. 
- else: - mm_kwargs_list = self.mm_input_mapper_profiling.process_inputs( - mm_data=dummy_mm_data, - mm_hashes=None, - mm_processor_kwargs=None, - precomputed_mm_inputs=None) - dummy_mm_kwargs = mm_kwargs_list[0] + dummy_mm_item = dummy_mm_data.get_item( + modality=dummy_data_modality, item_index=0) + dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item]) batched_dummy_mm_inputs = MultiModalKwargs.batch( [dummy_mm_kwargs] * max_num_mm_items) From 9b87a579aaf82338d5304219350932abae9b19ac Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Mon, 17 Mar 2025 16:22:14 +0800 Subject: [PATCH 24/34] [Misc][XPU] Use None as device capacity for XPU (#14932) Signed-off-by: yan ma --- vllm/platforms/xpu.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index d99d4ef3dac06..225e756cd7ce8 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -37,10 +37,11 @@ class XPUPlatform(Platform): return "vllm.attention.backends.ipex_attn.IpexAttnBackend" @staticmethod - def get_device_capability(device_id: int = 0) -> DeviceCapability: - major, minor, *_ = torch.xpu.get_device_capability( - device_id)['version'].split('.') - return DeviceCapability(major=int(major), minor=int(minor)) + def get_device_capability( + device_id: int = 0) -> Optional[DeviceCapability]: + # capacity format differs from cuda's and will cause unexpected + # failure, so use None directly + return None @staticmethod def get_device_name(device_id: int = 0) -> str: From dd3b865854c21c99ebc5d1bd34c12936002174c2 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Mon, 17 Mar 2025 16:29:36 +0800 Subject: [PATCH 25/34] [Doc] Add vLLM Beijing meetup slide (#14938) Signed-off-by: Chen Zhang --- README.md | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/README.md b/README.md index bfab7faf598b6..f61b4218e1824 100644 --- a/README.md +++ b/README.md @@ -13,18 +13,9 @@ Easy, fast, and cheap LLM serving for everyone | Documentation | Blog | Paper | Twitter/X | Developer Slack |

---- - -We’re excited to invite you to the first **vLLM China Meetup** on **March 16** in **Beijing**! - -Join us to connect with the **vLLM team** and explore how vLLM is leveraged in **post-training, fine-tuning, and deployment**, including [verl](https://github.com/volcengine/verl), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), and [vllm-ascend](https://github.com/vllm-project/vllm-ascend). - -👉 **[Register Now](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)** to be part of the discussion! - ---- - *Latest News* 🔥 +- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit#slide=id.g33fb1ff286e_0_29). - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0). - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted. - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). From 0a74bfce9cb9e51616c50b007e53400244cbc24a Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Mon, 17 Mar 2025 04:37:42 -0400 Subject: [PATCH 26/34] setup.py: drop assumption about local `main` branch (#14692) Signed-off-by: Russell Bryant --- setup.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/setup.py b/setup.py index d18fe53f12de1..d412f34b3e3dc 100755 --- a/setup.py +++ b/setup.py @@ -294,26 +294,28 @@ class repackage_wheel(build_ext): ]).decode("utf-8") upstream_main_commit = json.loads(resp_json)["sha"] - # Check if the local main branch is up-to-date. This is to ensure - # the base commit we found is the most recent commit on the main - # branch. - local_main_commit = subprocess.check_output( - ["git", "rev-parse", "main"]).decode("utf-8").strip() - if local_main_commit != upstream_main_commit: - raise ValueError( - f"Local main branch ({local_main_commit}) is not " - "up-to-date with upstream main branch " - f"({upstream_main_commit}). Please pull the latest " - "changes from upstream main branch first.") + # Check if the upstream_main_commit exists in the local repo + try: + subprocess.check_output( + ["git", "cat-file", "-e", f"{upstream_main_commit}"]) + except subprocess.CalledProcessError: + # If not present, fetch it from the remote repository. + # Note that this does not update any local branches, + # but ensures that this commit ref and its history are + # available in our local repo. + subprocess.check_call([ + "git", "fetch", "https://github.com/vllm-project/vllm", + "main" + ]) # Then get the commit hash of the current branch that is the same as # the upstream main commit. 
current_branch = subprocess.check_output( ["git", "branch", "--show-current"]).decode("utf-8").strip() - base_commit = subprocess.check_output( - ["git", "merge-base", "main", - current_branch]).decode("utf-8").strip() + base_commit = subprocess.check_output([ + "git", "merge-base", f"{upstream_main_commit}", current_branch + ]).decode("utf-8").strip() return base_commit except ValueError as err: raise ValueError(err) from None From cd0cd85102e4b5971dd44109776942df5cdca70f Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Mon, 17 Mar 2025 01:40:41 -0700 Subject: [PATCH 27/34] [MISC] More AMD unused var clean up (#14926) Signed-off-by: Lu Fang --- csrc/rocm/attention.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index 90f0b54d2f006..c500d00ea528e 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -127,7 +127,7 @@ __device__ __forceinline__ T from_float(const float& inp) { template __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) { - union tmpcvt { + [[maybe_unused]] union tmpcvt { uint16_t u; _Float16 f; __hip_bfloat16 b; @@ -160,7 +160,7 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) { template __device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1, const _B16x4& inp2) { - union tmpcvt { + [[maybe_unused]] union tmpcvt { uint16_t u; _Float16 f; __hip_bfloat16 b; @@ -1273,9 +1273,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const int seq_idx = blockIdx.y; const int context_len = context_lens[seq_idx]; const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; const int warpid = threadIdx.x / WARP_SIZE; - const int laneid = threadIdx.x % WARP_SIZE; + [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE; __shared__ float shared_global_exp_sum; // max num partitions supported is warp_size * NPAR_LOOPS From 69698f257e3a329fd68276459e82e37cd5ae43f2 Mon Sep 17 00:00:00 2001 From: kushanam <42385577+kushanam@users.noreply.github.com> Date: Mon, 17 Mar 2025 01:47:58 -0700 Subject: [PATCH 28/34] fix minor miscalled method (#14327) From b4ad56c1bd2fd39028f64919a11a4c5af96bf0c5 Mon Sep 17 00:00:00 2001 From: iefgnoix Date: Mon, 17 Mar 2025 01:48:28 -0700 Subject: [PATCH 29/34] [V1][TPU] Apply the ragged paged attention kernel fix and remove the padding. 
(#14846) Signed-off-by: Xiongfei Wei --- requirements/tpu.txt | 12 ++++++------ vllm/v1/worker/tpu_model_runner.py | 7 ++----- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/requirements/tpu.txt b/requirements/tpu.txt index 97a39bcd4a6d6..7246fc19bfa97 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -17,9 +17,9 @@ ray[data] --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index effcac7e7bdef..00869467be341 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -23,8 +23,7 @@ from vllm.multimodal.utils import group_mm_inputs_by_modality from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors from vllm.utils import LayerBlockType, cdiv, is_pin_memory_available -from vllm.v1.attention.backends.pallas import (NUM_KV_PAGES_PER_BLOCK, - PallasAttentionBackend, +from vllm.v1.attention.backends.pallas import (PallasAttentionBackend, PallasMetadata) from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, @@ -139,10 +138,8 @@ class TPUModelRunner: device="cpu") self.slot_mapping_np = self.slot_mapping_cpu.numpy() - 
padded_max_num_blocks_per_req = _get_padded_number( - self.max_num_blocks_per_req, NUM_KV_PAGES_PER_BLOCK) self.block_table_cpu = torch.zeros( - (self.max_num_tokens, padded_max_num_blocks_per_req), + (self.max_num_tokens, self.max_num_blocks_per_req), dtype=self.input_batch.block_table.get_cpu_tensor().dtype, device="cpu") From 868a8c5b2c8c042fc869eb30bce29fb8e19d979e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 17 Mar 2025 17:15:20 +0800 Subject: [PATCH 30/34] [Bugfix] Fix Ultravox on V1 (#14929) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/ultravox.py | 42 +++++++++++++++----------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index d368c145d55f9..cb1e143838496 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -5,7 +5,7 @@ import math from collections.abc import Iterable, Mapping, Sequence from functools import cached_property -from typing import Any, List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Any, Literal, Optional, Set, Tuple, TypedDict, Union import torch import torch.utils.checkpoint @@ -36,7 +36,7 @@ from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.ultravox import UltravoxConfig from .interfaces import (MultiModalEmbeddings, SupportsLoRA, - SupportsMultiModal, SupportsPP, SupportsV0Only) + SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings, @@ -50,14 +50,14 @@ _MAX_ENCODER_BATCH_SIZE = 16 class UltravoxAudioFeatureInputs(TypedDict): type: Literal["audio_features"] - data: NestedTensors + data: Union[torch.Tensor, list[torch.Tensor], list[list[torch.Tensor]]] """Shape: `(batch_size, num_chunks, 80, M)`""" - lens: NestedTensors + lens: Union[torch.Tensor, list[torch.Tensor]] """ Length of the audio frames. Used for attention mask in WhisperEncoder. Shape: `(batch_size, num_chunks)` """ - token_len: NestedTensors + token_len: Union[torch.Tensor, list[torch.Tensor]] """ Length of the audio tokens. Used for flattening the audio features. Shape: `(batch_size, num_chunks)` @@ -405,8 +405,7 @@ class ModifiedWhisperEncoder(WhisperEncoder): UltravoxMultiModalProcessor, info=UltravoxProcessingInfo, dummy_inputs=UltravoxDummyInputsBuilder) -class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, - SupportsV0Only): +class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], @@ -506,6 +505,12 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, if not isinstance(audio_features, (torch.Tensor, list)): raise ValueError("Incorrect type of audio features. " f"Got type: {type(audio_features)}") + if not isinstance(audio_lens, (torch.Tensor, list)): + raise ValueError("Incorrect type of audio_lens. " + f"Got type: {type(audio_features)}") + if not isinstance(audio_token_len, (torch.Tensor, list)): + raise ValueError("Incorrect type of audio_token_len. 
" + f"Got type: {type(audio_features)}") return UltravoxAudioFeatureInputs(type="audio_features", data=audio_features, @@ -523,7 +528,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, raise AssertionError("This line should be unreachable.") def _process_audio_input( - self, audio_input: UltravoxAudioInputs) -> NestedTensors: + self, + audio_input: UltravoxAudioInputs, + ) -> Union[NestedTensors, tuple[torch.Tensor, ...]]: if audio_input["type"] == "audio_embeds": return audio_input["data"] @@ -531,13 +538,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, # [[B1, 80, M1], [B2, 80, M2]] -> [B1+B2, 80, max(M1, M2)] audio_features = pad_and_concat_to_dim3(audio_input["data"]) - if isinstance(audio_input['lens'], list): - # [B1, B2] -> [B1+B2] - audio_lens = torch.cat(audio_input['lens']) - audio_token_len = torch.cat(audio_input['token_len']) - else: - audio_lens = flatten_bn(audio_input['lens']) - audio_token_len = flatten_bn(audio_input['token_len']) + # [B1, B2] -> [B1+B2] + audio_lens = flatten_bn(audio_input['lens'], concat=True) + audio_token_len = flatten_bn(audio_input['token_len'], concat=True) embeddings = self._audio_features_to_embeddings( audio_features, audio_lens) @@ -554,7 +557,12 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, # Apply mask and flatten flattened_embeddings = embeddings[mask] - return flattened_embeddings + # Return one tensor per input audio + embed_lens = [ + token_len_item.sum().item() + for token_len_item in audio_input['token_len'] + ] + return flattened_embeddings.split(embed_lens) def get_multimodal_embeddings( self, **kwargs: object) -> Optional[MultiModalEmbeddings]: @@ -646,7 +654,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, def pad_and_concat_to_dim3( - features: Union[torch.Tensor, List[torch.Tensor], List[List[torch.Tensor]]] + features: Union[torch.Tensor, list[torch.Tensor], list[list[torch.Tensor]]] ) -> torch.Tensor: """ Pad and concatenate a list of tensors. 
From 6eaf1e5c52d5e72a577ad03d378a28b39f0e849e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 17 Mar 2025 18:00:17 +0800 Subject: [PATCH 31/34] [Misc] Add `--seed` option to offline multi-modal examples (#14934) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 7 +- examples/offline_inference/audio_language.py | 132 +++-- .../encoder_decoder_multimodal.py | 48 +- examples/offline_inference/vision_language.py | 455 ++++++++++++------ .../vision_language_embedding.py | 31 +- .../vision_language_multi_image.py | 179 ++++--- 6 files changed, 537 insertions(+), 315 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f85572e7c234c..f5be8dca05f1d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -226,10 +226,13 @@ steps: - python3 offline_inference/basic/chat.py - python3 offline_inference/prefix_caching.py - python3 offline_inference/llm_engine_example.py - - python3 offline_inference/vision_language.py - - python3 offline_inference/vision_language_multi_image.py + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_embedding.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/encoder_decoder.py + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/basic/classify.py - python3 offline_inference/basic/embed.py - python3 offline_inference/basic/score.py diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 293b9fddac89e..02dbdcb64232f 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -7,11 +7,13 @@ For most models, the prompt format should follow corresponding examples on HuggingFace model repository. """ import os +from dataclasses import asdict +from typing import NamedTuple, Optional from huggingface_hub import snapshot_download from transformers import AutoTokenizer -from vllm import LLM, SamplingParams +from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.audio import AudioAsset from vllm.lora.request import LoRARequest from vllm.utils import FlexibleArgumentParser @@ -23,21 +25,31 @@ question_per_audio_count = { 2: "What sport and what nursery rhyme are referenced?" } + +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompt: str + stop_token_ids: Optional[list[int]] = None + lora_requests: Optional[list[LoRARequest]] = None + + # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # lower-end GPUs. # Unless specified, these settings have been tested to work on a single L4. 
# MiniCPM-O -def run_minicpmo(question: str, audio_count: int): +def run_minicpmo(question: str, audio_count: int) -> ModelRequestData: model_name = "openbmb/MiniCPM-o-2_6" tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - llm = LLM(model=model_name, - trust_remote_code=True, - max_model_len=4096, - max_num_seqs=5, - limit_mm_per_prompt={"audio": audio_count}) + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}, + ) stop_tokens = ['<|im_end|>', '<|endoftext|>'] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] @@ -52,11 +64,16 @@ def run_minicpmo(question: str, audio_count: int): tokenize=False, add_generation_prompt=True, chat_template=audio_chat_template) - return llm, prompt, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + stop_token_ids=stop_token_ids, + ) # Phi-4-multimodal-instruct -def run_phi4mm(questions: str, audio_count: int): +def run_phi4mm(question: str, audio_count: int) -> ModelRequestData: """ Phi-4-multimodal-instruct supports both image and audio inputs. Here, we show how to process audio inputs. @@ -67,9 +84,9 @@ def run_phi4mm(questions: str, audio_count: int): speech_lora_path = os.path.join(model_path, "speech-lora") placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)]) - prompts = f"<|user|>{placeholders}{questions}<|end|><|assistant|>" + prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>" - llm = LLM( + engine_args = EngineArgs( model=model_path, trust_remote_code=True, max_model_len=4096, @@ -79,24 +96,24 @@ def run_phi4mm(questions: str, audio_count: int): lora_extra_vocab_size=0, limit_mm_per_prompt={"audio": audio_count}, ) - lora_request = LoRARequest("speech", 1, speech_lora_path) - # To maintain code compatibility in this script, we add LoRA here. - llm.llm_engine.add_lora(lora_request=lora_request) - # You can also add LoRA using: - # llm.generate(prompts, lora_request=lora_request,...) 
- stop_token_ids = None - return llm, prompts, stop_token_ids + return ModelRequestData( + engine_args=engine_args, + prompt=prompts, + lora_requests=[LoRARequest("speech", 1, speech_lora_path)], + ) # Qwen2-Audio -def run_qwen2_audio(question: str, audio_count: int): +def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData: model_name = "Qwen/Qwen2-Audio-7B-Instruct" - llm = LLM(model=model_name, - max_model_len=4096, - max_num_seqs=5, - limit_mm_per_prompt={"audio": audio_count}) + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}, + ) audio_in_prompt = "".join([ f"Audio {idx+1}: " @@ -107,12 +124,15 @@ def run_qwen2_audio(question: str, audio_count: int): "<|im_start|>user\n" f"{audio_in_prompt}{question}<|im_end|>\n" "<|im_start|>assistant\n") - stop_token_ids = None - return llm, prompt, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + ) # Ultravox 0.5-1B -def run_ultravox(question: str, audio_count: int): +def run_ultravox(question: str, audio_count: int) -> ModelRequestData: model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b" tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -124,29 +144,39 @@ def run_ultravox(question: str, audio_count: int): tokenize=False, add_generation_prompt=True) - llm = LLM(model=model_name, - max_model_len=4096, - max_num_seqs=5, - trust_remote_code=True, - limit_mm_per_prompt={"audio": audio_count}) - stop_token_ids = None - return llm, prompt, stop_token_ids + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + trust_remote_code=True, + limit_mm_per_prompt={"audio": audio_count}, + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + ) # Whisper -def run_whisper(question: str, audio_count: int): +def run_whisper(question: str, audio_count: int) -> ModelRequestData: assert audio_count == 1, ( "Whisper only support single audio input per prompt") model_name = "openai/whisper-large-v3-turbo" prompt = "<|startoftranscript|>" - llm = LLM(model=model_name, - max_model_len=448, - max_num_seqs=5, - limit_mm_per_prompt={"audio": audio_count}) - stop_token_ids = None - return llm, prompt, stop_token_ids + engine_args = EngineArgs( + model=model_name, + max_model_len=448, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}, + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + ) model_example_map = { @@ -164,14 +194,24 @@ def main(args): raise ValueError(f"Model type {model} is not supported.") audio_count = args.num_audios - llm, prompt, stop_token_ids = model_example_map[model]( - question_per_audio_count[audio_count], audio_count) + req_data = model_example_map[model](question_per_audio_count[audio_count], + audio_count) + + engine_args = asdict(req_data.engine_args) | {"seed": args.seed} + llm = LLM(**engine_args) + + # To maintain code compatibility in this script, we add LoRA here. + # You can also add LoRA using: + # llm.generate(prompts, lora_request=lora_request,...) + if req_data.lora_requests: + for lora_request in req_data.lora_requests: + llm.llm_engine.add_lora(lora_request=lora_request) # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
sampling_params = SamplingParams(temperature=0.2, max_tokens=64, - stop_token_ids=stop_token_ids) + stop_token_ids=req_data.stop_token_ids) mm_data = {} if audio_count > 0: @@ -183,7 +223,7 @@ def main(args): } assert args.num_prompts > 0 - inputs = {"prompt": prompt, "multi_modal_data": mm_data} + inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data} if args.num_prompts > 1: # Batch inference inputs = [inputs] * args.num_prompts @@ -214,6 +254,10 @@ if __name__ == "__main__": default=1, choices=[0, 1, 2], help="Number of audio items per prompt.") + parser.add_argument("--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.") args = parser.parse_args() main(args) diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index f44bc423658ec..6d0c3ac1ee09a 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -4,16 +4,23 @@ This example shows how to use vLLM for running offline inference with the explicit/implicit prompt format on enc-dec LMMs for text generation. """ import time +from collections.abc import Sequence +from dataclasses import asdict +from typing import NamedTuple -from vllm import LLM, SamplingParams +from vllm import LLM, EngineArgs, PromptType, SamplingParams from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.utils import FlexibleArgumentParser +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompts: Sequence[PromptType] + + def run_florence2(): - # Create a Florence-2 encoder/decoder model instance - llm = LLM( + engine_args = EngineArgs( model="microsoft/Florence-2-large", tokenizer="facebook/bart-large", max_num_seqs=8, @@ -39,12 +46,15 @@ def run_florence2(): "decoder_prompt": "", }, ] - return llm, prompts + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) def run_mllama(): - # Create a Mllama encoder/decoder model instance - llm = LLM( + engine_args = EngineArgs( model="meta-llama/Llama-3.2-11B-Vision-Instruct", max_model_len=4096, max_num_seqs=2, @@ -69,12 +79,15 @@ def run_mllama(): "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501 }, ] - return llm, prompts + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) def run_whisper(): - # Create a Whisper encoder/decoder model instance - llm = LLM( + engine_args = EngineArgs( model="openai/whisper-large-v3-turbo", max_model_len=448, max_num_seqs=16, @@ -99,7 +112,11 @@ def run_whisper(): "decoder_prompt": "<|startoftranscript|>", } ] - return llm, prompts + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) model_example_map = { @@ -114,7 +131,12 @@ def main(args): if model not in model_example_map: raise ValueError(f"Model type {model} is not supported.") - llm, prompts = model_example_map[model]() + req_data = model_example_map[model]() + + engine_args = asdict(req_data.engine_args) | {"seed": args.seed} + llm = LLM(**engine_args) + + prompts = req_data.prompts # Create a sampling params object. 
sampling_params = SamplingParams( @@ -153,6 +175,10 @@ if __name__ == "__main__": default="mllama", choices=model_example_map.keys(), help='Huggingface "model_type".') + parser.add_argument("--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.") args = parser.parse_args() main(args) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 432cda5e24396..58fd5e53bf8dc 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -8,122 +8,164 @@ on HuggingFace model repository. """ import os import random +from dataclasses import asdict +from typing import NamedTuple, Optional from huggingface_hub import snapshot_download from transformers import AutoTokenizer -from vllm import LLM, SamplingParams +from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset from vllm.lora.request import LoRARequest from vllm.utils import FlexibleArgumentParser + +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompts: list[str] + stop_token_ids: Optional[list[int]] = None + lora_requests: Optional[list[LoRARequest]] = None + + # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # lower-end GPUs. # Unless specified, these settings have been tested to work on a single L4. # Aria -def run_aria(questions: list[str], modality: str): +def run_aria(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "rhymes-ai/Aria" # NOTE: Need L40 (or equivalent) to avoid OOM - llm = LLM(model=model_name, - max_model_len=4096, - max_num_seqs=2, - dtype="bfloat16", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=2, + dtype="bfloat16", + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) prompts = [(f"<|im_start|>user\n<|img|>{question}" "<|im_end|>\n<|im_start|>assistant\n") for question in questions] stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) # BLIP-2 -def run_blip2(questions: list[str], modality: str): +def run_blip2(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" # BLIP-2 prompt format is inaccurate on HuggingFace model repository. 
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa prompts = [f"Question: {question} Answer:" for question in questions] - llm = LLM(model="Salesforce/blip2-opt-2.7b", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - stop_token_ids = None - return llm, prompts, stop_token_ids + engine_args = EngineArgs( + model="Salesforce/blip2-opt-2.7b", + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # Chameleon -def run_chameleon(questions: list[str], modality: str): +def run_chameleon(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" prompts = [f"{question}" for question in questions] - llm = LLM(model="facebook/chameleon-7b", - max_model_len=4096, - max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - stop_token_ids = None - return llm, prompts, stop_token_ids + engine_args = EngineArgs( + model="facebook/chameleon-7b", + max_model_len=4096, + max_num_seqs=2, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # Deepseek-VL2 -def run_deepseek_vl2(questions: list[str], modality: str): +def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "deepseek-ai/deepseek-vl2-tiny" - llm = LLM(model=model_name, - max_model_len=4096, - max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, - hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}) + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=2, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}, + ) prompts = [ f"<|User|>: \n{question}\n\n<|Assistant|>:" for question in questions ] - stop_token_ids = None - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # Florence2 -def run_florence2(question: str, modality: str): +def run_florence2(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" - llm = LLM(model="microsoft/Florence-2-large", - tokenizer="facebook/bart-large", - max_num_seqs=8, - trust_remote_code=True, - dtype="bfloat16", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) + engine_args = EngineArgs( + model="microsoft/Florence-2-large", + tokenizer="facebook/bart-large", + max_num_seqs=8, + trust_remote_code=True, + dtype="bfloat16", + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) - prompt = "" - stop_token_ids = None - return llm, prompt, stop_token_ids + prompts = ["" for _ in questions] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # Fuyu -def run_fuyu(questions: list[str], modality: str): +def run_fuyu(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" prompts = [f"{question}\n" for question in questions] - llm = LLM(model="adept/fuyu-8b", - max_model_len=2048, - max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - stop_token_ids = None - return llm, prompts, stop_token_ids + engine_args = EngineArgs( + model="adept/fuyu-8b", + max_model_len=2048, + max_num_seqs=2, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + return ModelRequestData( + 
engine_args=engine_args, + prompts=prompts, + ) # Gemma 3 -def run_gemma3(questions: list[str], modality: str): +def run_gemma3(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "google/gemma-3-4b-it" - llm = LLM( + engine_args = EngineArgs( model=model_name, max_model_len=2048, max_num_seqs=2, @@ -135,22 +177,27 @@ def run_gemma3(questions: list[str], modality: str): prompts = [("user\n" f"{question}\n" "model\n") for question in questions] - stop_token_ids = None - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # GLM-4v -def run_glm4v(questions: list[str], modality: str): +def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "THUDM/glm-4v-9b" - llm = LLM(model=model_name, - max_model_len=2048, - max_num_seqs=2, - trust_remote_code=True, - enforce_eager=True, - hf_overrides={"architectures": ["GLM4VForCausalLM"]}, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) + engine_args = EngineArgs( + model=model_name, + max_model_len=2048, + max_num_seqs=2, + trust_remote_code=True, + enforce_eager=True, + hf_overrides={"architectures": ["GLM4VForCausalLM"]}, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) prompts = [ f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ @@ -158,16 +205,21 @@ def run_glm4v(questions: list[str], modality: str): ] stop_token_ids = [151329, 151336, 151338] - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) # H2OVL-Mississippi -def run_h2ovl(questions: list[str], modality: str): +def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "h2oai/h2ovl-mississippi-800m" - llm = LLM( + engine_args = EngineArgs( model=model_name, trust_remote_code=True, max_model_len=8192, @@ -187,15 +239,20 @@ def run_h2ovl(questions: list[str], modality: str): # Stop tokens for H2OVL-Mississippi # https://huggingface.co/h2oai/h2ovl-mississippi-800m stop_token_ids = [tokenizer.eos_token_id] - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) # Idefics3-8B-Llama3 -def run_idefics3(questions: list[str], modality: str): +def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "HuggingFaceM4/Idefics3-8B-Llama3" - llm = LLM( + engine_args = EngineArgs( model=model_name, max_model_len=8192, max_num_seqs=2, @@ -212,17 +269,20 @@ def run_idefics3(questions: list[str], modality: str): prompts = [( f"<|begin_of_text|>User:{question}\nAssistant:" ) for question in questions] - stop_token_ids = None - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # InternVL -def run_internvl(questions: list[str], modality: str): +def run_internvl(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "OpenGVLab/InternVL2-2B" - llm = LLM( + engine_args = EngineArgs( model=model_name, trust_remote_code=True, max_model_len=4096, @@ -245,53 +305,75 @@ def run_internvl(questions: list[str], modality: str): # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] stop_token_ids = 
[tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) # LLaVA-1.5 -def run_llava(questions: list[str], modality: str): +def run_llava(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" prompts = [ f"USER: \n{question}\nASSISTANT:" for question in questions ] - llm = LLM(model="llava-hf/llava-1.5-7b-hf", - max_model_len=4096, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - stop_token_ids = None - return llm, prompts, stop_token_ids + engine_args = EngineArgs( + model="llava-hf/llava-1.5-7b-hf", + max_model_len=4096, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # LLaVA-1.6/LLaVA-NeXT -def run_llava_next(questions: list[str], modality: str): +def run_llava_next(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" prompts = [f"[INST] \n{question} [/INST]" for question in questions] - llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", - max_model_len=8192, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - stop_token_ids = None - return llm, prompts, stop_token_ids + engine_args = EngineArgs( + model="llava-hf/llava-v1.6-mistral-7b-hf", + max_model_len=8192, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # LlaVA-NeXT-Video # Currently only support for video input -def run_llava_next_video(questions: list[str], modality: str): +def run_llava_next_video(questions: list[str], + modality: str) -> ModelRequestData: assert modality == "video" prompts = [ f"USER: