[Performance][gpt-oss] Revert gpt-oss max cudagraph size to 1024 (#28345)

Signed-off-by: Mohammad Miadh Angkad <MAngkad.BSDSBA2027@aim.edu>
2026-03-16 14:17:16 +08:00 · 2025-11-09 06:50:10 +08:00 · 2025-11-09 06:50:10 +08:00 · 404d7a9d14
commit 404d7a9d14
parent 171133f929
1 changed file with 4 additions and 6 deletions
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -258,9 +258,9 @@ class GptOssForCausalLMConfig(VerifyAndUpdateConfig):
        if structured_outputs_config.reasoning_parser == "":
            structured_outputs_config.reasoning_parser = "openai_gptoss"

-        # Increase the max capture size from 512 to 992 for performance.
+        # Increase the max capture size from 512 to 1024 for performance.
        # NOTE(woosuk): This will increase the number of CUDA graphs
-        # from 67 to 81.
+        # from 67 to 83.
        compilation_config = vllm_config.compilation_config
        # Only override when the user has not set either of
        # cudagraph_capture_sizes or max_cudagraph_capture_size.
@@ -268,11 +268,9 @@ class GptOssForCausalLMConfig(VerifyAndUpdateConfig):
            compilation_config.cudagraph_capture_sizes is None
            and compilation_config.max_cudagraph_capture_size is None
        ):
-            # FIXME(woosuk): When using full cuda graph with FA3, the max
-            # supported size is 992.
-            compilation_config.max_cudagraph_capture_size = 992
+            compilation_config.max_cudagraph_capture_size = 1024
            logger.info(
-                "Overriding max cuda graph capture size to %d for performance.", 992
+                "Overriding max cuda graph capture size to %d for performance.", 1024
            )