[MISC] cudagraph_capture_sizes related improvements (#26016)

Signed-off-by: fhl <2410591650@qq.com>
Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
fhl2000 2025-10-24 20:11:05 +08:00 committed by GitHub
parent 435be10db9
commit 284cc92275
14 changed files with 303 additions and 110 deletions

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
+from contextlib import nullcontext

import pytest
@@ -8,6 +9,8 @@ from vllm.compilation.counter import compilation_counter
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
from vllm.config.compilation import CompilationMode
+from vllm.engine.arg_utils import EngineArgs
+from vllm.platforms import current_platform
from vllm.utils.torch_utils import _is_torch_equal_or_newer, is_torch_equal_or_newer
@@ -233,3 +236,73 @@ def test_resolve_operator_overload():
    assert len(resolved) == 2  # Only 2 valid ops
    assert resolved[0] is torch.ops.aten.mm.default
    assert resolved[1] is torch.ops.aten.addmm.default
+
+
+@pytest.mark.skipif(
+    not current_platform.support_static_graph_mode(),
+    reason="Skip if cudagraph mode is not supported",
+)
+@pytest.mark.parametrize(
+    (
+        "cudagraph_capture_sizes",
+        "max_cudagraph_capture_size",
+        "tp_size",
+        "enable_sequence_parallelism",
+        "max_num_batched_tokens",
+        "use_cudagraph",
+        "expected_max_size",
+    ),
+    [
+        (None, None, 1, False, 2048, True, 512),
+        ([1, 2, 4], 4, 1, False, 2048, True, 4),
+        ([1, 2, 4], 8, 1, False, 2048, True, RuntimeError),
+        ([1, 256], None, 1, False, 2048, True, 256),
+        ([], None, 1, False, 2048, False, 0),
+        (None, 0, 1, False, 2048, False, 0),
+        # truncated to the nearest multiple of 8 or 16
+        (None, 257, 1, False, 2048, True, 256),
+        ([1, 2, 4, 15], None, 1, False, 2048, True, 15),  # max from the list
+        ([1, 2, 4, 15], None, 2, True, 2048, True, 4),  # 15 filtered out due to SP
+        ([1, 2, 4, 15], None, 1, False, 8, True, 4),  # limited by max_num_batched_tokens
+        # the list should contain at least one element when using cudagraph
+        ([], None, 1, False, 2048, True, RuntimeError),
+        # the max capture size should be >= 1 when using cudagraph
+        (None, 0, 1, False, 2048, True, RuntimeError),
+    ],
+)
+def test_cudagraph_sizes_post_init(
+    cudagraph_capture_sizes,
+    max_cudagraph_capture_size,
+    tp_size,
+    enable_sequence_parallelism,
+    max_num_batched_tokens,
+    use_cudagraph,
+    expected_max_size,
+):
+    ctx = nullcontext()
+    if isinstance(expected_max_size, type) and issubclass(expected_max_size, Exception):
+        ctx = pytest.raises(expected_max_size)
+    cudagraph_mode = CUDAGraphMode.PIECEWISE if use_cudagraph else CUDAGraphMode.NONE
+    with ctx:
+        compilation_config = CompilationConfig(
+            cudagraph_capture_sizes=cudagraph_capture_sizes,
+            max_cudagraph_capture_size=max_cudagraph_capture_size,
+            pass_config={
+                "enable_sequence_parallelism": enable_sequence_parallelism,
+                "enable_fusion": True,
+                "enable_noop": True,
+            },
+            cudagraph_mode=cudagraph_mode,
+        )
+        engine_args = EngineArgs(
+            model="facebook/opt-125m",
+            tensor_parallel_size=tp_size,
+            max_num_batched_tokens=max_num_batched_tokens,
+            compilation_config=compilation_config,
+        )
+        vllm_config = engine_args.create_engine_config()
+        assert (
+            vllm_config.compilation_config.max_cudagraph_capture_size
+            == expected_max_size
+        )

View File

@@ -154,6 +154,8 @@ class CompilationConfig:
- [`cudagraph_mode`][vllm.config.CompilationConfig.cudagraph_mode]
- [`cudagraph_capture_sizes`]
  [vllm.config.CompilationConfig.cudagraph_capture_sizes]
+- [`max_cudagraph_capture_size`]
+  [vllm.config.CompilationConfig.max_cudagraph_capture_size]
- [`cudagraph_num_of_warmups`]
  [vllm.config.CompilationConfig.cudagraph_num_of_warmups]
- [`cudagraph_copy_inputs`]
@@ -327,18 +329,16 @@ class CompilationConfig:
more modes may be added.
"""
use_cudagraph: bool = True
-"""Whether to use cudagraph inside compilation.
+"""Whether to use cudagraph inside compilation:
-- False: cudagraph inside compilation is not used.
+- False: cudagraph inside compilation is not used.\n
- True: cudagraph inside compilation is used. It requires
  that all input buffers have fixed addresses, and all
  splitting ops write their outputs to input buffers.
-In the vLLM V1 Engine, this flag only applies for
-CompilationMode.VLLM_COMPILE (aka -O3).
-Note that this is orthogonal to the cudagraph capture logic
-outside of compilation.
Warning: This flag is deprecated and will be removed in the next major or
-minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode=PIECEWISE
-instead.
+minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode=FULL_AND
+_PIECEWISE instead.
"""
cudagraph_num_of_warmups: int = 0
"""Number of warmup runs for cudagraph.
@@ -398,8 +398,22 @@ class CompilationConfig:
pass_config: PassConfig = field(default_factory=PassConfig)
"""Custom inductor passes, see PassConfig for more details"""
-max_capture_size: int = field(default=None, init=False)  # type: ignore
-"""not configurable, computed after init"""
+max_cudagraph_capture_size: int | None = field(default=None)
+"""The maximum cudagraph capture size.
+If cudagraph_capture_sizes is specified, this will be set to the largest
+size in that list (or checked for consistency if it is also specified). If
+cudagraph_capture_sizes is not specified, the list of sizes is generated
+automatically following the pattern:
+[1, 2, 4] + list(range(8, 256, 8)) + list(
+    range(256, max_cudagraph_capture_size + 1, 16))
+If not specified, max_cudagraph_capture_size is set to min(max_num_seqs * 2,
+512) by default. This avoids OOM in tight memory scenarios with small
+max_num_seqs, and prevents capture of many large graphs (>512) that would
+greatly increase startup time with limited performance benefit.
+"""
local_cache_dir: str = field(default=None, init=False)  # type: ignore
"""local cache dir for each rank"""
bs_to_padded_graph_size: list[int] = field(
@@ -408,7 +422,7 @@
)
"""optimization:
Intuitively, bs_to_padded_graph_size should be dict[int, int].
-since we know all keys are in a range [0, max_capture_size],
+since we know all keys are in a range [0, max_cudagraph_capture_size],
we can optimize it to list[int] for better lookup performance."""

# keep track of enabled and disabled custom ops
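For readers skimming the diff: a minimal standalone sketch (not part of this commit) of the capture-size pattern described in the new max_cudagraph_capture_size docstring above; the helper name default_capture_sizes is made up for illustration.

def default_capture_sizes(max_cudagraph_capture_size: int) -> list[int]:
    # 1, 2, 4, then multiples of 8 up to 256 (exclusive), then multiples of 16.
    sizes = [n for n in (1, 2, 4) if n <= max_cudagraph_capture_size]
    sizes += list(range(8, min(max_cudagraph_capture_size + 1, 256), 8))
    if max_cudagraph_capture_size >= 256:
        sizes += list(range(256, max_cudagraph_capture_size + 1, 16))
    return sizes

print(default_capture_sizes(512)[-3:])  # [480, 496, 512]
print(len(default_capture_sizes(512)))  # 51 sizes to capture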
@@ -672,25 +686,12 @@ class CompilationConfig:
        return VllmBackend(vllm_config)

-    def init_with_cudagraph_sizes(self, cudagraph_capture_sizes: list[int]) -> None:
-        """To complete the initialization of config,
-        we need to know the cudagraph sizes."""
-
-        if self.cudagraph_capture_sizes is None:
-            self.cudagraph_capture_sizes = cudagraph_capture_sizes
-        else:
-            # de-duplicate the sizes provided by the config
-            dedup_sizes = list(set(self.cudagraph_capture_sizes))
-            if len(dedup_sizes) < len(self.cudagraph_capture_sizes):
-                logger.info(
-                    (
-                        "cudagraph sizes specified by model runner"
-                        " %s is overridden by config %s"
-                    ),
-                    cudagraph_capture_sizes,
-                    dedup_sizes,
-                )
-            self.cudagraph_capture_sizes = dedup_sizes
+    def post_init_cudagraph_sizes(self) -> None:
+        """To complete the initialization after cudagraph related
+        configs are set. This includes:
+        - initialize compile_sizes
+        - pre-compute the mapping bs_to_padded_graph_size
+        """

        computed_compile_sizes = []
        if self.compile_sizes is not None:
@@ -708,23 +709,24 @@ class CompilationConfig:
                computed_compile_sizes.append(x)
        self.compile_sizes = computed_compile_sizes  # type: ignore

-        # sort to make sure cudagraph capture sizes are in descending order
-        self.cudagraph_capture_sizes.sort(reverse=True)
-        self.max_capture_size = (
-            self.cudagraph_capture_sizes[0] if self.cudagraph_capture_sizes else 0
-        )
+        # make sure the sizes are in ascending order
+        self.cudagraph_capture_sizes.sort()
+        if self.cudagraph_capture_sizes:
+            assert self.cudagraph_capture_sizes[-1] == self.max_cudagraph_capture_size

        # pre-compute the mapping from batch size to padded graph size
-        self.bs_to_padded_graph_size = [0 for i in range(self.max_capture_size + 1)]
+        self.bs_to_padded_graph_size = [
+            0 for i in range(self.max_cudagraph_capture_size + 1)
+        ]
        for end, start in zip(
-            self.cudagraph_capture_sizes, self.cudagraph_capture_sizes[1:] + [0]
+            self.cudagraph_capture_sizes + [self.max_cudagraph_capture_size + 1],
+            [0] + self.cudagraph_capture_sizes,
        ):
            for bs in range(start, end):
                if bs == start:
                    self.bs_to_padded_graph_size[bs] = start
                else:
                    self.bs_to_padded_graph_size[bs] = end
-        self.bs_to_padded_graph_size[self.max_capture_size] = self.max_capture_size

    def set_splitting_ops_for_v1(self):
        # NOTE: this function needs to be called only when mode is
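As a sanity check (not part of this commit), a minimal sketch of the padding lookup that post_init_cudagraph_sizes precomputes, assuming an ascending capture list:

capture_sizes = [1, 2, 4, 8]  # ascending, as the new code expects
max_size = capture_sizes[-1]

bs_to_padded = [0] * (max_size + 1)
for end, start in zip(capture_sizes + [max_size + 1], [0] + capture_sizes):
    for bs in range(start, end):
        bs_to_padded[bs] = start if bs == start else end

# Each batch size pads up to the nearest capture size:
print(bs_to_padded)  # [0, 1, 2, 4, 4, 8, 8, 8, 8]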

View File

@@ -71,14 +71,6 @@ class SchedulerConfig:
    NOTE: This will be replaced by speculative config in the future; it is
    present to enable correctness tests until then."""

-    cuda_graph_sizes: list[int] = field(default_factory=list)
-    """Cuda graph capture sizes
-    1. if none provided, then default set to [min(max_num_seqs * 2, 512)]
-    2. if one value is provided, then the capture list would follow the
-    pattern: [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)]
-    3. more than one value (e.g. 1 2 128) is provided, then the capture list
-    will follow the provided list."""

    enable_chunked_prefill: SkipValidation[bool] = None  # type: ignore
    """If True, prefill requests can be chunked based
    on the remaining max_num_batched_tokens."""
@@ -235,13 +227,6 @@
            self.long_prefill_token_threshold,
        )

-        # NOTE: Default set cuda_graph_sizes to [min(max_num_seqs * 2, 512)].
-        # This avoids OOM in tight memory scenarios with small max_num_seqs,
-        # and prevents capture of many large graphs (>512) that would greatly
-        # increase startup time with limited performance benefit.
-        if not self.cuda_graph_sizes:
-            self.cuda_graph_sizes = [min(self.max_num_seqs * 2, 512)]

        if self.async_scheduling:
            self.scheduler_cls = "vllm.v1.core.sched.async_scheduler.AsyncScheduler"

View File

@@ -197,10 +197,10 @@ class VllmConfig:
        return hash_str

    def pad_for_cudagraph(self, batch_size: int) -> int:
-        # if batch_size > self.compilation_config.max_capture_size,
+        # if batch_size > self.compilation_config.max_cudagraph_capture_size,
        # it should raise an IndexError.
        # the caller should make sure the batch_size is within the range,
-        # i.e., batch_size <= self.compilation_config.max_capture_size
+        # i.e., batch_size <= self.compilation_config.max_cudagraph_capture_size
        return self.compilation_config.bs_to_padded_graph_size[batch_size]

    @staticmethod
@@ -396,6 +396,9 @@
        if self.model_config is not None and self.model_config.enforce_eager:
            logger.info("Cudagraph is disabled under eager mode")
            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+            # override related settings when enforce eager
+            self.compilation_config.max_cudagraph_capture_size = 0
+            self.compilation_config.cudagraph_capture_sizes = []
        elif envs.VLLM_USE_V1:
            self.compilation_config.cudagraph_num_of_warmups = 1
@@ -654,11 +657,13 @@
```python
max_graph_size = min(max_num_seqs * 2, 512)
-# 1, 2, 4, then multiples of 8 up to max_graph_size
-cuda_graph_sizes = [1, 2, 4, 8, 16, 24, 32, 40, ..., max_graph_size]
+# 1, 2, 4, then multiples of 8 up to 256 and then multiples of 16
+# up to max_graph_size
+cuda_graph_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(
+    range(256, max_graph_size + 1, 16))

In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
-will be the final sizes to capture cudagraph (in descending order).
+will be the final sizes to capture cudagraph (in ascending order).

These sizes are used to capture and reuse CUDA graphs for
performance-critical paths (e.g., decoding). Capturing enables
@@ -685,35 +690,111 @@
        not be used.
        """

-        # calculate the default `batch_size_capture_list`
-        batch_size_capture_list = []
-        if self.model_config is not None and not self.model_config.enforce_eager:
-            cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
-            if len(cuda_graph_sizes) == 1:
-                max_graph_size = cuda_graph_sizes[0]
-                assert max_graph_size >= 1, (
-                    "Maximum cudagraph size should be greater than or equal to 1."
-                )
-                batch_size_capture_list = [
-                    i for i in [1, 2, 4] if i <= max_graph_size
-                ] + list(range(8, max_graph_size + 1, 8))
-            elif len(cuda_graph_sizes) > 1:
-                batch_size_capture_list = sorted(cuda_graph_sizes)
-            else:
-                raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
+        if (
+            self.model_config is not None
+            and not self.model_config.enforce_eager
+            and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+        ):
+            # determine the initial max_cudagraph_capture_size
+            max_cudagraph_capture_size = (
+                self.compilation_config.max_cudagraph_capture_size
+            )
+            if max_cudagraph_capture_size is None:
+                max_cudagraph_capture_size = min(
+                    self.scheduler_config.max_num_seqs * 2, 512
+                )
+            max_num_tokens = self.scheduler_config.max_num_batched_tokens
+            max_cudagraph_capture_size = min(max_num_tokens, max_cudagraph_capture_size)
+
+            assert max_cudagraph_capture_size >= 1, (
+                "Maximum cudagraph size should be greater than or equal to 1 "
+                "when using cuda graph."
+            )
+
+            # determine the cudagraph_capture_sizes
+            if self.compilation_config.cudagraph_capture_sizes is not None:
+                assert len(self.compilation_config.cudagraph_capture_sizes) > 0, (
+                    "cudagraph_capture_sizes should contain at least one element "
+                    "when using cuda graph."
+                )
+                # de-duplicate the sizes provided by the config
+                dedup_sizes = list(set(self.compilation_config.cudagraph_capture_sizes))
+                cudagraph_capture_sizes = dedup_sizes
+                # sort to make sure the sizes are in ascending order
+                cudagraph_capture_sizes.sort()
+            else:
+                cudagraph_capture_sizes = [
+                    i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
+                ]
+                if max_cudagraph_capture_size >= 8:
+                    # Step size 8 for small batch sizes, up to 256 (not included)
+                    cudagraph_capture_sizes += list(
+                        range(8, min(max_cudagraph_capture_size + 1, 256), 8)
+                    )
+                if max_cudagraph_capture_size >= 256:
+                    # Step size 16 for larger batch sizes
+                    cudagraph_capture_sizes += list(
+                        range(256, max_cudagraph_capture_size + 1, 16)
+                    )

            if (
                self.parallel_config.tensor_parallel_size > 1
                and self.compilation_config.pass_config.enable_sequence_parallelism
            ):
-                batch_size_capture_list = self.update_sizes_for_sequence_parallelism(
-                    batch_size_capture_list
+                cudagraph_capture_sizes = self.update_sizes_for_sequence_parallelism(
+                    cudagraph_capture_sizes
                )

-            max_num_tokens = self.scheduler_config.max_num_batched_tokens
-            batch_size_capture_list = [
-                size for size in batch_size_capture_list if size <= max_num_tokens
-            ]
-
-        self.compilation_config.init_with_cudagraph_sizes(batch_size_capture_list)
+            # user-specified compilation_config.max_cudagraph_capture_size gets
+            # truncated to valid_max_size when they are inconsistent.
+            valid_max_size = (
+                cudagraph_capture_sizes[-1] if cudagraph_capture_sizes else 0
+            )
+            if (
+                self.compilation_config.max_cudagraph_capture_size is not None
+                and self.compilation_config.max_cudagraph_capture_size != valid_max_size
+            ):
+                # raise an error only when both flags are user-specified
+                # and they are inconsistent with each other
+                if self.compilation_config.cudagraph_capture_sizes is not None:
+                    raise ValueError(
+                        "customized max_cudagraph_capture_size"
+                        f"(={self.compilation_config.max_cudagraph_capture_size}) "
+                        "should be consistent with the max value of "
+                        f"cudagraph_capture_sizes(={valid_max_size})"
+                    )
+                logger.warning(
+                    "Truncating max_cudagraph_capture_size to %d",
+                    valid_max_size,
+                )
+            # always set the final max_cudagraph_capture_size
+            self.compilation_config.max_cudagraph_capture_size = valid_max_size
+
+            if self.compilation_config.cudagraph_capture_sizes is not None and len(
+                cudagraph_capture_sizes
+            ) < len(self.compilation_config.cudagraph_capture_sizes):
+                # If users have specified capture sizes, we only need to
+                # compare the lengths before and after modification, since the
+                # modified list is a subset of the original list.
+                logger.warning(
+                    (
+                        "cudagraph_capture_sizes specified in compilation_config"
+                        " %s is overridden by config %s"
+                    ),
+                    self.compilation_config.cudagraph_capture_sizes,
+                    cudagraph_capture_sizes,
+                )
+            # always write back the final sizes
+            self.compilation_config.cudagraph_capture_sizes = cudagraph_capture_sizes
+        else:
+            # no cudagraph in use
+            self.compilation_config.max_cudagraph_capture_size = 0
+            self.compilation_config.cudagraph_capture_sizes = []
+
+        # complete the remaining process.
+        self.compilation_config.post_init_cudagraph_sizes()

    def recalculate_max_model_len(self, max_model_len: int):
        # Can only be called in try_verify_and_update_config
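A hedged illustration of the truncation rule above, mirroring the (None, 257, ..., 256) test case added earlier in this commit; it assumes the facebook/opt-125m config is resolvable and a platform where cudagraph mode is not forced to NONE:

from vllm.config import CompilationConfig
from vllm.engine.arg_utils import EngineArgs

# 257 is not on the generated grid (..., 240, 248, 256), so the final
# max_cudagraph_capture_size is truncated to 256 with a warning.
args = EngineArgs(
    model="facebook/opt-125m",
    compilation_config=CompilationConfig(max_cudagraph_capture_size=257),
)
vllm_config = args.create_engine_config()
assert vllm_config.compilation_config.max_cudagraph_capture_size == 256
assert vllm_config.compilation_config.cudagraph_capture_sizes[-1] == 256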

View File

@@ -364,7 +364,13 @@ class EngineArgs:
    kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
    seed: int | None = ModelConfig.seed
    max_model_len: int | None = ModelConfig.max_model_len
-    cuda_graph_sizes: list[int] = get_field(SchedulerConfig, "cuda_graph_sizes")
+    cuda_graph_sizes: list[int] | None = CompilationConfig.cudagraph_capture_sizes
+    cudagraph_capture_sizes: list[int] | None = (
+        CompilationConfig.cudagraph_capture_sizes
+    )
+    max_cudagraph_capture_size: int | None = get_field(
+        CompilationConfig, "max_cudagraph_capture_size"
+    )
    # Note: Specifying a custom executor backend by passing a class
    # is intended for expert use only. The API may change without
    # notice.
@@ -1007,9 +1013,6 @@
            "--max-long-partial-prefills",
            **scheduler_kwargs["max_long_partial_prefills"],
        )
-        scheduler_group.add_argument(
-            "--cuda-graph-sizes", **scheduler_kwargs["cuda_graph_sizes"]
-        )
        scheduler_group.add_argument(
            "--long-prefill-token-threshold",
            **scheduler_kwargs["long_prefill_token_threshold"],
@@ -1039,6 +1042,29 @@
            "--async-scheduling", **scheduler_kwargs["async_scheduling"]
        )

+        # Compilation arguments
+        compilation_kwargs = get_kwargs(CompilationConfig)
+        compilation_group = parser.add_argument_group(
+            title="CompilationConfig",
+            description=CompilationConfig.__doc__,
+        )
+        compilation_group.add_argument(
+            "--cudagraph-capture-sizes", **compilation_kwargs["cudagraph_capture_sizes"]
+        )
+        compilation_kwargs["cudagraph_capture_sizes"]["help"] = (
+            "--cuda-graph-sizes is deprecated and will be removed in v0.13.0 or v1.0.0,"
+            " whichever is soonest. Please use --cudagraph-capture-sizes instead."
+        )
+        compilation_group.add_argument(
+            "--cuda-graph-sizes",
+            **compilation_kwargs["cudagraph_capture_sizes"],
+            deprecated=True,
+        )
+        compilation_group.add_argument(
+            "--max-cudagraph-capture-size",
+            **compilation_kwargs["max_cudagraph_capture_size"],
+        )

        # vLLM arguments
        vllm_kwargs = get_kwargs(VllmConfig)
        vllm_group = parser.add_argument_group(
@@ -1548,7 +1574,6 @@
            max_num_batched_tokens=self.max_num_batched_tokens,
            max_num_seqs=self.max_num_seqs,
            max_model_len=model_config.max_model_len,
-            cuda_graph_sizes=self.cuda_graph_sizes,
            num_lookahead_slots=num_lookahead_slots,
            enable_chunked_prefill=self.enable_chunked_prefill,
            disable_chunked_mm_input=self.disable_chunked_mm_input,
@@ -1616,6 +1641,38 @@
            collect_detailed_traces=self.collect_detailed_traces,
        )

+        # Compilation config overrides
+        if self.cuda_graph_sizes is not None:
+            logger.warning(
+                "--cuda-graph-sizes is deprecated and will be removed in v0.13.0 or "
+                "v1.0.0, whichever is soonest. Please use --cudagraph-capture-sizes "
+                "instead."
+            )
+            if self.compilation_config.cudagraph_capture_sizes is not None:
+                raise ValueError(
+                    "cuda_graph_sizes and compilation_config."
+                    "cudagraph_capture_sizes are mutually exclusive"
+                )
+            self.compilation_config.cudagraph_capture_sizes = self.cuda_graph_sizes
+        if self.cudagraph_capture_sizes is not None:
+            if self.compilation_config.cudagraph_capture_sizes is not None:
+                raise ValueError(
+                    "cudagraph_capture_sizes and compilation_config."
+                    "cudagraph_capture_sizes are mutually exclusive"
+                )
+            self.compilation_config.cudagraph_capture_sizes = (
+                self.cudagraph_capture_sizes
+            )
+        if self.max_cudagraph_capture_size is not None:
+            if self.compilation_config.max_cudagraph_capture_size is not None:
+                raise ValueError(
+                    "max_cudagraph_capture_size and compilation_config."
+                    "max_cudagraph_capture_size are mutually exclusive"
+                )
+            self.compilation_config.max_cudagraph_capture_size = (
+                self.max_cudagraph_capture_size
+            )

        config = VllmConfig(
            model_config=model_config,
            cache_config=cache_config,
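A brief usage sketch (not part of this commit) of the new engine arguments; either spelling works, but setting the same knob both at the top level and inside compilation_config raises a ValueError, as implemented above:

from vllm.config import CompilationConfig
from vllm.engine.arg_utils import EngineArgs

# New dedicated arguments...
args = EngineArgs(model="facebook/opt-125m", cudagraph_capture_sizes=[1, 2, 4, 8])

# ...or the equivalent field inside compilation_config (but not both at once).
args = EngineArgs(
    model="facebook/opt-125m",
    compilation_config=CompilationConfig(max_cudagraph_capture_size=256),
)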

View File

@@ -185,7 +185,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
        self.moe = moe
        self.mxfp4_backend = get_mxfp4_backend()
        self.max_capture_size = (
-            get_current_vllm_config().compilation_config.max_capture_size
+            get_current_vllm_config().compilation_config.max_cudagraph_capture_size
        )
        assert self.mxfp4_backend != Mxfp4Backend.NONE, (

View File

@@ -259,21 +259,19 @@ class GptOssForCausalLMConfig(VerifyAndUpdateConfig):
        # Increase the max capture size from 512 to 992 for performance.
        # NOTE(woosuk): This will increase the number of CUDA graphs
        # from 67 to 81.
-        scheduler_config = vllm_config.scheduler_config
-        if len(scheduler_config.cuda_graph_sizes) == 1:
-            max_capture_size = scheduler_config.cuda_graph_sizes[0]
+        compilation_config = vllm_config.compilation_config
+        # Only override when the user has not set either of
+        # cudagraph_capture_sizes or max_cudagraph_capture_size.
+        if (
+            compilation_config.cudagraph_capture_sizes is None
+            and compilation_config.max_cudagraph_capture_size is None
+        ):
            # FIXME(woosuk): When using full cuda graph with FA3, the max
            # supported size is 992.
-            if max_capture_size < 992:
-                cuda_graph_sizes = [1, 2, 4]
-                # Step size 8 for small batch sizes
-                cuda_graph_sizes += [i for i in range(8, 256, 8)]
-                # Step size 16 for larger batch sizes
-                cuda_graph_sizes += [i for i in range(256, 993, 16)]
-                scheduler_config.cuda_graph_sizes = cuda_graph_sizes
-                logger.info(
-                    "Overriding max cuda graph capture size to %d for performance.", 992
-                )
+            compilation_config.max_cudagraph_capture_size = 992
+            logger.info(
+                "Overriding max cuda graph capture size to %d for performance.", 992
+            )

class MambaModelConfig(VerifyAndUpdateConfig):

View File

@@ -236,7 +236,7 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
        self.use_full_cuda_graph = (
            self.compilation_config.cudagraph_mode.has_full_cudagraphs()
        )
-        self.max_cudagraph_size = self.compilation_config.max_capture_size
+        self.max_cudagraph_size = self.compilation_config.max_cudagraph_capture_size

        if self.use_full_cuda_graph and self.aot_schedule:
            if self.max_cudagraph_size > 992:

View File

@@ -324,7 +324,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
        ] = {}
        self._decode_cudagraph_max_bs = min(
            (1 + num_spec_tokens) * max_num_reqs,
-            self.compilation_config.max_capture_size,
+            self.compilation_config.max_cudagraph_capture_size,
        )
        self.num_qo_heads = self.model_config.get_num_attention_heads(

View File

@@ -87,7 +87,7 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
        )
        self.decode_cudagraph_max_bs = min(
            self.vllm_config.scheduler_config.max_num_seqs * (self.num_spec + 1),
-            self.compilation_config.max_capture_size,
+            self.compilation_config.max_cudagraph_capture_size,
        )
        self.spec_state_indices_tensor = torch.empty(

View File

@@ -36,7 +36,7 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC):
        self.compilation_config = vllm_config.compilation_config
        self.decode_cudagraph_max_bs = min(
            self.vllm_config.scheduler_config.max_num_seqs,
-            self.compilation_config.max_capture_size,
+            self.compilation_config.max_cudagraph_capture_size,
        )
        self.state_indices_tensor = torch.empty(
            (self.decode_cudagraph_max_bs,),

View File

@@ -89,7 +89,7 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata]
        self.use_full_cuda_graph = (
            self.compilation_config.cudagraph_mode.has_full_cudagraphs()
        )
-        self.max_cudagraph_size = self.compilation_config.max_capture_size
+        self.max_cudagraph_size = self.compilation_config.max_cudagraph_capture_size

        if self.use_full_cuda_graph and self.fa_aot_schedule:
            if self.max_cudagraph_size > 992:

View File

@@ -104,7 +104,7 @@ class EagleProposer:
        )
        self.cudagraph_batch_sizes = (
-            list(reversed(self.vllm_config.compilation_config.cudagraph_capture_sizes))
+            (sorted(self.vllm_config.compilation_config.cudagraph_capture_sizes))
            if self.use_cuda_graph
            else []
        )

View File

@@ -379,16 +379,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
        self.async_output_copy_stream = torch.cuda.Stream()
        self.prepare_inputs_event = torch.cuda.Event()

-        # TODO(woosuk): Provide an option to tune the max cudagraph batch size.
-        # The convention is different.
        # self.cudagraph_batch_sizes sorts in ascending order.
-        # The batch sizes in the config are in descending order.
        if (
            self.compilation_config.cudagraph_capture_sizes
            and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
        ):
-            self.cudagraph_batch_sizes = list(
-                reversed(self.compilation_config.cudagraph_capture_sizes)
+            self.cudagraph_batch_sizes = sorted(
+                self.compilation_config.cudagraph_capture_sizes
            )

        # Cache the device properties.
@@ -3791,7 +3788,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
        if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE:
            cudagraph_runtime_mode = cudagraph_mode.mixed_mode()

+            # make sure we capture the largest batch size first
            compilation_cases = list(
                product(reversed(self.cudagraph_batch_sizes), lora_cases)
            )