[Feature] Add VLLM_DISABLE_PAD_FOR_CUDAGRAPH to Avoid Hang Issue (#23595)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
parent f9ca2b40a0
commit 321938e9ac
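For quick testing, a minimal usage sketch (the model name and script shape are illustrative, not part of this commit) sets the flag before engine startup:

    import os

    # Set the flag before constructing the engine; "1" disables padding of
    # the batch to the captured CUDA graph sizes, "0" (the default) keeps it.
    os.environ["VLLM_DISABLE_PAD_FOR_CUDAGRAPH"] = "1"

    from vllm import LLM  # imported after the flag is in the environment

    llm = LLM(model="facebook/opt-125m")  # model choice is illustrative
    outputs = llm.generate(["Hello, world"])
    print(outputs[0].outputs[0].text)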
@@ -166,6 +166,7 @@ if TYPE_CHECKING:
     VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
     VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False
     VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None
+    VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False


 def get_default_cache_root():
@@ -1144,6 +1145,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_ENABLE_CUDAGRAPH_GC":
     lambda: bool(int(os.getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0"))),

+    # Disable padding to CUDA graph capture batch sizes.
+    # TODO(wentao): https://github.com/vllm-project/vllm/issues/23378
+    # After the issue is fixed, we can remove this flag.
+    "VLLM_DISABLE_PAD_FOR_CUDAGRAPH":
+    lambda: bool(int(os.getenv("VLLM_DISABLE_PAD_FOR_CUDAGRAPH", "0"))),
+
     # Used to force set up loopback IP
     "VLLM_LOOPBACK_IP":
     lambda: os.getenv("VLLM_LOOPBACK_IP", ""),
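A note on the parsing pattern: bool(int(os.getenv(name, "0"))) accepts integer strings only, so "1" enables the flag while "true" or "yes" raise ValueError. A minimal standalone sketch of the same pattern (the env_flag helper name is illustrative, not vLLM's):

    import os

    def env_flag(name: str, default: str = "0") -> bool:
        # Mirrors the bool(int(os.getenv(...))) pattern used above:
        # "0" -> False, any nonzero integer string -> True.
        # Non-integer strings such as "true" raise ValueError.
        return bool(int(os.getenv(name, default)))

    os.environ["VLLM_DISABLE_PAD_FOR_CUDAGRAPH"] = "1"
    assert env_flag("VLLM_DISABLE_PAD_FOR_CUDAGRAPH") is True
    assert env_flag("SOME_UNSET_FLAG") is False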
@@ -1491,6 +1491,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):

         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+                and not envs.VLLM_DISABLE_PAD_FOR_CUDAGRAPH
                 and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]):
             # Use CUDA graphs.
             # Add padding to the batch size.
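To make the guarded control flow concrete, a simplified standalone sketch of the padding decision (a toy reconstruction, not vLLM's actual runner; effective_batch_size and the capture sizes are illustrative):

    import bisect
    import os

    # Captured CUDA graph batch sizes, sorted ascending (values illustrative).
    CUDAGRAPH_BATCH_SIZES = [1, 2, 4, 8, 16, 32]

    def effective_batch_size(num_scheduled_tokens: int) -> int:
        # Toy reconstruction of the guarded padding logic: pad up to the
        # nearest captured size unless the new flag disables padding.
        disable_pad = bool(int(os.getenv("VLLM_DISABLE_PAD_FOR_CUDAGRAPH", "0")))
        if not disable_pad and num_scheduled_tokens <= CUDAGRAPH_BATCH_SIZES[-1]:
            # Replay a captured CUDA graph: round up to the capture size.
            idx = bisect.bisect_left(CUDAGRAPH_BATCH_SIZES, num_scheduled_tokens)
            return CUDAGRAPH_BATCH_SIZES[idx]
        # Flag set (or batch too large): run eagerly with the exact size.
        return num_scheduled_tokens

    print(effective_batch_size(5))   # -> 8 while padding is enabled
    os.environ["VLLM_DISABLE_PAD_FOR_CUDAGRAPH"] = "1"
    print(effective_batch_size(5))   # -> 5 once the flag disables padding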