vllm/tests/kernels/moe/modular_kernel_tools/cli_args.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse

import torch

import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig

from .common import Config
from .mk_objects import (
    MK_ALL_PREPARE_FINALIZE_TYPES,
    MK_FUSED_EXPERT_TYPES,
    MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES,
)


def make_config_arg_parser(description: str):
    def to_pf_class_type(s: str) -> mk.FusedMoEPrepareAndFinalize:
        for pf in MK_ALL_PREPARE_FINALIZE_TYPES:
            if pf.__name__ == s:
                return pf
        raise ValueError(f"Cannot find a PrepareFinalize type that matches {s}")

    def to_experts_class_type(s: str) -> mk.FusedMoEPermuteExpertsUnpermute:
        for fe in MK_FUSED_EXPERT_TYPES:
            if fe.__name__ == s:
                return fe
        raise ValueError(f"Cannot find a FusedExperts type that matches {s}")

    def to_quant_torch_dtype(s: str) -> torch.dtype:
        if s == "torch.float8_e4m3fn":
            return torch.float8_e4m3fn
        raise ValueError(f"Unsupported quant type {s}")

    parser = argparse.ArgumentParser(description=description)

    parser.add_argument(
        "--world-size",
        type=int,
        default=2,
        help="Number of ranks that participate in all2all",
    )
    parser.add_argument(
        "--pf-type",
        type=to_pf_class_type,
        required=True,
        help=(
            "Choose a PrepareFinalize Type : "
            f"{[x.__name__ for x in MK_ALL_PREPARE_FINALIZE_TYPES]}"
        ),
    )
    parser.add_argument(
        "--experts-type",
        type=to_experts_class_type,
        required=True,
        help=(
            f"Choose a FusedExpert type : {[x.__name__ for x in MK_FUSED_EXPERT_TYPES]}"
        ),
    )
    parser.add_argument(
        "-m",
        nargs="+",
        type=int,
        default=[64],
        help="num tokens per rank",
    )
    parser.add_argument(
        "-k",
        type=int,
        default=7168,
        help="hidden-size",
    )
    parser.add_argument(
        "-n",
        type=int,
        default=1024,
        help="N dimension of the first fused-moe matmul",
    )
    parser.add_argument(
        "--num-experts", type=int, default=32, help="Global num experts"
    )
    parser.add_argument("--topk", nargs="+", type=int, default=[4, 1], help="num topk")
    parser.add_argument(
        "--fused-moe-chunk-size",
        type=int,
        help="Fused moe chunk size used for the non-batched fused experts impl.",
    )

    # Quant args
    parser.add_argument(
        "--quant-dtype", type=to_quant_torch_dtype, help="Quant datatype"
    )
    parser.add_argument(
        "--per-token-quantized-activations",
        action="store_true",
        help=("The input activations must be per-token quantized"),
    )
    parser.add_argument(
        "--per-channel-quantized-weights",
        action="store_true",
        help="The weights must be per-channel quantized.",
    )
    parser.add_argument(
        "--block-shape", nargs="+", type=int, help="Quantization block shape"
    )

    # Torch trace profile generation args
    parser.add_argument(
        "--torch-trace-dir-path",
        type=str,
        default=None,
        help="Get torch trace for single execution",
    )

    return parser


def _validate_args(args: argparse.Namespace):
    if args.quant_dtype is not None:
        assert args.quant_dtype == torch.float8_e4m3fn
        if args.block_shape is not None:
            assert len(args.block_shape) == 2, (
                f"block shape must have 2 elements. got {args.block_shape}"
            )

    if args.experts_type in MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES:
        assert args.world_size == 1, "Single GPU objects need world size set to 1"

    if args.torch_trace_dir_path is not None:
        from pathlib import Path

        assert Path(args.torch_trace_dir_path).is_dir(), (
            f"Please create {args.torch_trace_dir_path}"
        )


def make_config(args: argparse.Namespace) -> Config:
    _validate_args(args)

    quant_config = None
    if args.quant_dtype is not None:
        quant_config = FusedMoEQuantConfig(
            quant_dtype=args.quant_dtype,
            per_act_token_quant=args.per_token_quantized_activations,
            per_out_ch_quant=args.per_channel_quantized_weights,
            block_shape=args.block_shape,
        )

    return Config(
        Ms=args.m,
        K=args.k,
        N=args.n,
        E=args.num_experts,
        topks=args.topk,
        dtype=torch.bfloat16,  # hard-code
        quant_config=quant_config,
        prepare_finalize_type=args.pf_type,
        fused_experts_type=args.experts_type,
        fused_moe_chunk_size=args.fused_moe_chunk_size,
        world_size=args.world_size,
        torch_trace_dir_path=args.torch_trace_dir_path,
    )