[Chore]: Extract math and argparse utilities to separate modules (#27188)

Signed-off-by: Yeshwanth Surya <yeshsurya@gmail.com>
Signed-off-by: Yeshwanth N <yeshsurya@gmail.com>
Signed-off-by: yeshsurya <yeshsurya@gmail.com>
Yeshwanth N 2025-10-26 16:33:32 +05:30 committed by GitHub
parent 8fb7b2fab9
commit 71b1c8b667
125 changed files with 716 additions and 640 deletions
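The changed files all follow the same migration: symbols that used to be imported from the vllm.utils package root now come from dedicated submodules, vllm.utils.argparse_utils for the CLI parser and vllm.utils.math_utils for the integer helpers. A minimal sketch of the before/after import style (the surrounding usage is illustrative, not taken from any file in this diff):

# Before this commit:
#   from vllm.utils import FlexibleArgumentParser, cdiv, round_up
# After this commit:
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.math_utils import cdiv, round_up

parser = FlexibleArgumentParser(description="example script")
parser.add_argument("--num-blocks", type=int, default=cdiv(4096, 16))  # illustrative flag
args = parser.parse_args([])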

@@ -5,7 +5,7 @@ import gc
from benchmark_utils import TimeCollector
from tabulate import tabulate
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool

@@ -46,7 +46,7 @@ import time
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
def test_long_document_qa(llm=None, sampling_params=None, prompts=None):

@@ -19,7 +19,7 @@ from vllm.config import (
    VllmConfig,
)
from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.worker.gpu_input_batch import InputBatch
from vllm.v1.worker.gpu_model_runner import GPUModelRunner

@@ -37,7 +37,7 @@ from transformers import PreTrainedTokenizerBase
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
try:
    from vllm.transformers_utils.tokenizer import get_tokenizer

@@ -11,7 +11,7 @@ import time
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
# Select a equi-probable random priority

@@ -51,7 +51,7 @@ except ImportError:
    from backend_request_func import get_tokenizer
try:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser
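Standalone benchmark scripts keep this import inside a try/except so they still run when vLLM is not installed; the commit only changes the module path inside the guard. A self-contained sketch of that fallback pattern (the command-line flag is illustrative):

try:
    from vllm.utils.argparse_utils import FlexibleArgumentParser
except ImportError:
    # vLLM not installed: fall back to the standard-library parser.
    from argparse import ArgumentParser as FlexibleArgumentParser

parser = FlexibleArgumentParser(description="standalone benchmark")
parser.add_argument("--num-prompts", type=int, default=1000)  # illustrative flag
args = parser.parse_args([])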

@@ -15,7 +15,7 @@ from utils import make_rand_sparse_tensors
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]

@@ -18,7 +18,8 @@ from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    w8a8_triton_block_scaled_mm,
)
-from vllm.utils import FlexibleArgumentParser, cdiv
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.math_utils import cdiv
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
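Hunks like this one split a combined import: FlexibleArgumentParser moves to vllm.utils.argparse_utils while cdiv moves to vllm.utils.math_utils. The math helpers touched by this commit (cdiv, round_up, next_power_of_2) are small integer utilities; the sketch below shows their commonly used behavior and is written from scratch for orientation, not copied from the vLLM source:

def cdiv(a: int, b: int) -> int:
    # Ceiling division: cdiv(10, 8) == 2.
    return -(a // -b)

def round_up(x: int, multiple: int) -> int:
    # Round x up to the nearest multiple: round_up(10, 8) == 16.
    return ((x + multiple - 1) // multiple) * multiple

def next_power_of_2(n: int) -> int:
    # Smallest power of two >= n: next_power_of_2(10) == 16.
    return 1 if n <= 0 else 1 << (n - 1).bit_length()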

@@ -10,7 +10,7 @@ import torch
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.triton_utils import triton
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

@@ -10,7 +10,7 @@ import vllm.model_executor.layers.activation # noqa F401
from vllm.model_executor.custom_op import CustomOp
from vllm.platforms import current_platform
from vllm.triton_utils import triton
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
batch_size_range = [1, 16, 32, 64, 128]

@@ -28,7 +28,7 @@ except ImportError as e:
from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
parser = FlexibleArgumentParser(
    description="Benchmark BitBLAS int4 on a specific target."

@@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.config import (
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.scalar_type import scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
WEIGHT_SHAPES_MOE = {
    "nvidia/DeepSeek-R1-FP4": [

@@ -14,7 +14,7 @@ from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_confi
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
# Weight shapes for different models: [num_experts, topk, hidden_size,
# intermediate_size]

@@ -39,7 +39,7 @@ from vllm.distributed.device_communicators.pynccl_allocator import (
)
from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
from vllm.logger import init_logger
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
logger = init_logger(__name__)

@@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
    fused_experts,
    fused_topk,
)
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
DEFAULT_MODELS = [
    "nm-testing/Mixtral-8x7B-Instruct-v0.1",

@@ -7,7 +7,7 @@ import torch
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

@@ -25,7 +25,7 @@ if HAS_TRITON:
    from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
    from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_TP_SIZES = [1]

@@ -33,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
    quantize_weights,
)
from vllm.scalar_type import ScalarType, scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]

@@ -44,7 +44,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
    sort_weights,
)
from vllm.scalar_type import ScalarType, scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]

@@ -22,7 +22,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.triton_utils import triton
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
FP8_DTYPE = current_platform.fp8_dtype()

@@ -17,7 +17,7 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
)
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
FP8_DTYPE = current_platform.fp8_dtype()

@@ -39,7 +39,7 @@ import torch
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

@@ -9,7 +9,7 @@ import torch
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
    create_kv_caches_with_random,

@@ -7,7 +7,7 @@ import torch
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

@@ -9,7 +9,7 @@ from tabulate import tabulate
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
    create_kv_caches_with_random,

@@ -12,7 +12,7 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
)
from vllm.logger import init_logger
from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
    create_kv_caches_with_random_flash,

@@ -8,7 +8,7 @@ import torch
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope
from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
def benchmark_rope_kernels_multi_lora(

@@ -8,7 +8,7 @@ from datetime import datetime
import flashinfer
import torch
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
FP8_DTYPE = torch.float8_e4m3fn

@@ -8,7 +8,7 @@ from datetime import datetime
import flashinfer
import torch
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
FP8_DTYPE = torch.float8_e4m3fn

@@ -18,7 +18,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
mp.set_start_method("spawn", force=True)

@@ -11,7 +11,7 @@ import regex as re
import seaborn as sns
from torch.utils.benchmark import Measurement as TMeasurement
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
if __name__ == "__main__":
    parser = FlexibleArgumentParser(

@@ -5,7 +5,7 @@ import cProfile
import pstats
from vllm import LLM, SamplingParams
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
# A very long prompt, total number of tokens is about 15k.
LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000

@@ -18,7 +18,7 @@ from transformers import AutoTokenizer
from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.lora.request import LoRARequest
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
question_per_audio_count = {

@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
def create_parser():

@@ -4,7 +4,7 @@
from argparse import Namespace
from vllm import LLM, EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
def parse_args():

@@ -4,7 +4,7 @@
from argparse import Namespace
from vllm import LLM, EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
def parse_args():

@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
def create_parser():

@@ -4,7 +4,7 @@
from argparse import Namespace
from vllm import LLM, EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
def parse_args():

@@ -4,7 +4,7 @@
from argparse import Namespace
from vllm import LLM, EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
def parse_args():

@@ -13,7 +13,7 @@ from typing import NamedTuple
from vllm import LLM, EngineArgs, PromptType, SamplingParams
from vllm.assets.audio import AudioAsset
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
class ModelRequestData(NamedTuple):

@@ -8,7 +8,7 @@ for processing prompts with various sampling parameters.
import argparse
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
def create_test_prompts() -> list[tuple[str, SamplingParams]]:

@@ -25,7 +25,7 @@ python load_sharded_state.py \
import dataclasses
from vllm import LLM, EngineArgs, SamplingParams
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
def parse_args():

@@ -4,7 +4,7 @@
from argparse import Namespace
from vllm import LLM, EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
def parse_args():

@@ -4,7 +4,7 @@
from argparse import Namespace
from vllm import LLM, EngineArgs, PoolingParams
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
def parse_args():

@@ -4,7 +4,7 @@
from argparse import Namespace
from vllm import LLM, EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
def parse_args():

@@ -5,7 +5,7 @@
from argparse import Namespace
from vllm import LLM, EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
def parse_args():

@@ -13,7 +13,7 @@ from tqdm import tqdm
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
DURATION_MS = int(os.getenv("VLLM_TPU_PROFILE_DURATION_MS", 3000))
DELAY_MS = int(os.getenv("VLLM_TPU_PROFILE_DELAY_MS", 0))

@@ -13,7 +13,7 @@ from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.multimodal.image import convert_image_mode
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
class QueryResult(NamedTuple):

@@ -30,7 +30,7 @@ from pathlib import Path
from vllm import LLM, EngineArgs
from vllm.model_executor.model_loader import ShardedStateLoader
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
def parse_args():

@@ -9,7 +9,7 @@ from vllm.inputs import TokensPrompt
from vllm.v1.metrics.reader import Counter, Vector
try:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser

@@ -22,7 +22,7 @@ from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import convert_image_mode
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
class ModelRequestData(NamedTuple):

@@ -18,7 +18,7 @@ from transformers import AutoProcessor, AutoTokenizer
from vllm import LLM, EngineArgs, SamplingParams
from vllm.lora.request import LoRARequest
from vllm.multimodal.utils import fetch_image
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
QUESTION = "What is the content of each image?"
IMAGE_URLS = [

@@ -18,7 +18,7 @@ from PIL.Image import Image
from vllm import LLM, EngineArgs
from vllm.entrypoints.score_utils import ScoreMultiModalParam
from vllm.multimodal.utils import fetch_image
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
ROOT_DIR = Path(__file__).parent.parent.parent
EXAMPLES_DIR = ROOT_DIR / "examples"

@@ -26,7 +26,7 @@ import requests
from openai import OpenAI
from utils import get_first_model
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"

@@ -16,7 +16,7 @@ from vllm.model_executor.model_loader.tensorizer import (
    tensorize_vllm_model,
    tensorizer_kwargs_arg,
)
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
logger = logging.getLogger()

@@ -22,7 +22,7 @@ from vllm.engine.arg_utils import (
    optional_type,
    parse_type,
)
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
@pytest.mark.parametrize(

@@ -7,7 +7,7 @@ import pytest
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
from vllm.entrypoints.openai.serving_models import LoRAModulePath
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from ...utils import VLLM_PATH

@@ -6,7 +6,6 @@ import pytest
import torch
from vllm.platforms import current_platform
-from vllm.utils import cdiv
from vllm.utils.deep_gemm import (
    _ceil_to_ue8m0,
    calc_diff,
@@ -16,6 +15,7 @@ from vllm.utils.deep_gemm import (
    get_paged_mqa_logits_metadata,
)
from vllm.utils.import_utils import has_deep_gemm
+from vllm.utils.math_utils import cdiv
def kv_cache_cast_to_fp8(x: torch.Tensor) -> torch.Tensor:

@@ -10,7 +10,7 @@ from tests.kernels.quantization.nvfp4_utils import (
    get_nvfp4_global_scale,
)
from vllm.platforms import current_platform
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
if not current_platform.is_device_capability(100):
    pytest.skip(

@@ -7,7 +7,7 @@ from torch import Tensor
import vllm._custom_ops as ops
from vllm.platforms import current_platform
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
def ref_mla(

@@ -5,7 +5,7 @@ import pytest
import torch
from vllm.attention.ops.triton_decode_attention import decode_attention_fwd
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
@pytest.mark.parametrize("B", [3, 5])

@@ -13,8 +13,8 @@ from tests.kernels.moe.utils import per_token_cast_to_fp8
from tests.kernels.utils import baseline_scaled_mm
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
-from vllm.utils import cdiv
from vllm.utils.deep_gemm import per_block_cast_to_fp8
+from vllm.utils.math_utils import cdiv
@pytest.mark.parametrize(

@@ -27,7 +27,7 @@ from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
    triton_kernel_moe_forward,
)
from vllm.model_executor.layers.utils import shuffle_weight
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
def deshuffle(w: torch.Tensor):

@@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
    moe_align_block_size,
)
from vllm.platforms import current_platform
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
NUM_TOKENS = [1, 3, 256, 2256, 4096]
NUM_EXPERTS = [32, 160, 256, 257]

@@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassBatchedExper
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
from vllm.platforms import current_platform
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch

@@ -45,7 +45,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
    TopKWeightAndReduceDelegate,
)
from vllm.platforms import current_platform
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch

@@ -8,7 +8,7 @@ from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
    persistent_masked_m_silu_mul_quant,
)
from vllm.platforms import current_platform
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
fp8_dtype = torch.float8_e4m3fn

@@ -16,8 +16,8 @@ from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
)
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
-from vllm.utils import round_up
from vllm.utils.deep_gemm import per_block_cast_to_fp8
+from vllm.utils.math_utils import round_up
def triton_moe(

@@ -6,7 +6,7 @@ import torch
from vllm.model_executor.layers.quantization.utils.quant_utils import group_broadcast
from vllm.platforms import current_platform
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
# Using the default value (240.0) from pytorch will cause accuracy
# issue on dynamic quantization models. Here use 224.0 for rocm.

@@ -13,7 +13,7 @@ import torch
from tests.kernels.utils import baseline_scaled_mm, opcheck, to_fp8, to_int8
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
MNK_FACTORS = [
    (1, 256, 128),

@@ -18,7 +18,7 @@ from tests.v1.attention.utils import (
from vllm.attention.backends.registry import _Backend
from vllm.config import ModelConfig
from vllm.platforms import current_platform
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, is_torch_equal_or_newer
from vllm.v1.attention.backends.utils import (
    CommonAttentionMetadata,

@@ -22,7 +22,7 @@ from vllm import _custom_ops as ops
from vllm.attention.backends.registry import _Backend
from vllm.attention.ops.flashmla import is_flashmla_dense_supported
from vllm.config.vllm import set_current_vllm_config
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import FullAttentionSpec

@@ -23,7 +23,7 @@ from tests.v1.attention.utils import (
from vllm import _custom_ops as ops
from vllm.attention.ops import flashmla
from vllm.model_executor.layers.linear import ColumnParallelLinear
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backends.mla.flashmla_sparse import FlashMLASparseBackend
from vllm.v1.attention.backends.mla.indexer import split_prefill_chunks

@@ -8,7 +8,7 @@ import pytest
from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
def test_prefix_caching_from_cli():

@@ -16,7 +16,7 @@ from vllm.attention.layer import Attention
from vllm.attention.selector import get_attn_backend
from vllm.config import CacheConfig, VllmConfig
from vllm.logger import init_logger
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backends.utils import (
    CommonAttentionMetadata,
    subclass_attention_backend,

@@ -7,7 +7,7 @@ import jax
from jax.experimental import pallas as pl
from jax.experimental.pallas import tpu as pltpu
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
def _kv_cache_update_kernel(

@@ -6,7 +6,7 @@ import torch
from vllm.attention.ops.paged_attn import PagedAttention
from vllm.platforms import current_platform
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
FP8_DTYPE = current_platform.fp8_dtype()

@@ -58,7 +58,7 @@ except ImportError:
    librosa = PlaceholderModule("librosa")
try:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser

@@ -19,7 +19,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
    kStaticTensorScale,
)
from vllm.platforms import current_platform
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
from .fusion import QUANT_OPS, empty_bf16, empty_fp32, empty_i32
from .fx_utils import is_func

@@ -82,7 +82,8 @@ from vllm.transformers_utils.config import (
    maybe_override_with_speculators,
)
from vllm.transformers_utils.utils import check_gguf_file
-from vllm.utils import FlexibleArgumentParser, is_in_ray_actor
+from vllm.utils import is_in_ray_actor
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.network_utils import get_ip
from vllm.v1.sample.logits_processor import LogitsProcessor

@@ -26,7 +26,8 @@ from vllm.entrypoints.utils import with_cancellation
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser, random_uuid, set_ulimit
+from vllm.utils import random_uuid, set_ulimit
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.version import __version__ as VLLM_VERSION
logger = init_logger("vllm.entrypoints.api_server")

@@ -9,7 +9,7 @@ from vllm.entrypoints.cli.types import CLISubcommand
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
if typing.TYPE_CHECKING:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
else:
    FlexibleArgumentParser = argparse.ArgumentParser
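The CLI subcommand modules only need FlexibleArgumentParser for type annotations, so they import it under TYPE_CHECKING and alias the standard-library parser at runtime; the commit simply updates the guarded import path. A runnable sketch of that pattern (the function name and flag below are illustrative, not taken from the vLLM source):

import argparse
import typing

if typing.TYPE_CHECKING:
    from vllm.utils.argparse_utils import FlexibleArgumentParser
else:
    # At runtime the name only has to exist for annotations to resolve.
    FlexibleArgumentParser = argparse.ArgumentParser

def add_cli_args(parser: "FlexibleArgumentParser") -> "FlexibleArgumentParser":
    parser.add_argument("--verbose", action="store_true")  # illustrative flag
    return parser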

@@ -8,7 +8,7 @@ from vllm.collect_env import main as collect_env_main
from vllm.entrypoints.cli.types import CLISubcommand
if typing.TYPE_CHECKING:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
else:
    FlexibleArgumentParser = argparse.ArgumentParser

@@ -20,7 +20,7 @@ def main():
    import vllm.entrypoints.cli.run_batch
    import vllm.entrypoints.cli.serve
    from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG, cli_env_setup
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
    CMD_MODULES = [
        vllm.entrypoints.cli.openai,

@@ -13,7 +13,7 @@ from openai.types.chat import ChatCompletionMessageParam
from vllm.entrypoints.cli.types import CLISubcommand
if TYPE_CHECKING:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
else:
    FlexibleArgumentParser = argparse.ArgumentParser

@@ -11,7 +11,7 @@ from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
from vllm.logger import init_logger
if typing.TYPE_CHECKING:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
else:
    FlexibleArgumentParser = argparse.ArgumentParser

@@ -5,7 +5,7 @@ import argparse
import typing
if typing.TYPE_CHECKING:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
else:
    FlexibleArgumentParser = argparse.ArgumentParser

@@ -29,7 +29,7 @@ from vllm.entrypoints.constants import (
from vllm.entrypoints.openai.serving_models import LoRAModulePath
from vllm.entrypoints.openai.tool_parsers import ToolParserManager
from vllm.logger import init_logger
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
logger = init_logger(__name__)

@@ -32,7 +32,8 @@ from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingM
from vllm.entrypoints.openai.serving_score import ServingScores
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
-from vllm.utils import FlexibleArgumentParser, random_uuid
+from vllm.utils import random_uuid
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.version import __version__ as VLLM_VERSION
logger = init_logger(__name__)

@@ -31,7 +31,7 @@ from vllm.entrypoints.openai.serving_models import LoRAModulePath
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.transformers_utils.tokenizers import MistralTokenizer
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
logger = init_logger(__name__)

@@ -21,7 +21,7 @@ import torch.nn.functional as F
from einops import rearrange
from vllm.triton_utils import tl, triton
-from vllm.utils import cdiv, next_power_of_2
+from vllm.utils.math_utils import cdiv, next_power_of_2
from .utils import input_guard

@@ -14,9 +14,9 @@ from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import (
    OCP_MX_Scheme,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
-from vllm.utils import cdiv
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
from vllm.utils.import_utils import has_triton_kernels
+from vllm.utils.math_utils import cdiv
logger = init_logger(__name__)

@@ -10,8 +10,8 @@ import torch
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe.utils import count_expert_num_tokens
from vllm.triton_utils import tl, triton
-from vllm.utils import round_up
from vllm.utils.deep_gemm import get_mk_alignment_for_contiguous_layout
+from vllm.utils.math_utils import round_up
def expert_num_tokens_round_up_and_sum(

@@ -12,7 +12,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
    TopKWeightAndReduceDelegate,
)
from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
from vllm.v1.worker.ubatching import (
    dbo_current_ubatch_id,
    dbo_enabled,

@@ -55,9 +55,9 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
from vllm.platforms.interface import CpuArchEnum
-from vllm.utils import cdiv, round_up
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
from vllm.utils.import_utils import has_deep_ep, has_pplx
+from vllm.utils.math_utils import cdiv, round_up
from vllm.utils.torch_utils import current_stream, direct_register_custom_op
from vllm.v1.worker.ubatching import dbo_current_ubatch_id

@@ -16,7 +16,7 @@ from vllm.model_executor.layers.fused_moe.utils import (
    count_expert_num_tokens,
    disable_inplace,
)
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
from vllm.v1.worker.ubatching import (
    dbo_current_ubatch_id,
    dbo_enabled,

@@ -5,7 +5,7 @@ import torch
from vllm import _custom_ops as ops
from vllm.triton_utils import triton
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
def moe_align_block_size(

@@ -15,7 +15,7 @@ from vllm.model_executor.layers.fused_moe.utils import (
    _validate_scale_shape,
    moe_kernel_quantize_input,
)
-from vllm.utils import cdiv, round_up
+from vllm.utils.math_utils import cdiv, round_up
logger = init_logger(__name__)

@@ -23,8 +23,8 @@ from vllm.model_executor.layers.quantization.utils.mxfp8_utils import (
    mxfp8_e4m3_quantize,
)
from vllm.triton_utils import tl, triton
-from vllm.utils import cdiv
from vllm.utils.flashinfer import flashinfer_fp4_quantize
+from vllm.utils.math_utils import cdiv
from vllm.utils.torch_utils import is_torch_equal_or_newer

@@ -26,7 +26,7 @@ from vllm.config import ModelConfig, ParallelConfig, VllmConfig, set_current_vll
from vllm.logger import init_logger
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.import_utils import PlaceholderModule
if TYPE_CHECKING:

Some files are not shown because too many files changed in this diff.