[misc] instruct pytorch to use nvml-based cuda check (#15951)

Signed-off-by: youkaichao <youkaichao@gmail.com>
Author: youkaichao <youkaichao@gmail.com>
Date: 2025-04-03 01:02:58 +08:00 (committed by GitHub)
Parent: 8bd651b318
Commit: 1cab43c2d2
2 changed files with 25 additions and 16 deletions
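
Background: by default, torch.cuda.is_available() can initialize CUDA in the calling process, which later breaks CUDA use in workers created with fork. Setting PYTORCH_NVML_BASED_CUDA_CHECK=1 makes PyTorch answer the check through NVML instead. A minimal sketch of the intended effect (not part of this commit, using only the public torch.cuda API):

    import os

    # Must be visible before the availability check runs.
    os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1'

    import torch

    # With the flag set, this probes the driver via NVML rather than creating
    # a CUDA context, so a child process created with os.fork() can still
    # initialize CUDA on its own.
    print(torch.cuda.is_available())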

vllm/__init__.py

@@ -4,9 +4,10 @@
 # version library first. Such assumption is critical for some customization.
 from .version import __version__, __version_tuple__  # isort:skip
-import os
-
-import torch
+# The environment variables override should be imported before any other
+# modules to ensure that the environment variables are set before any
+# other modules are imported.
+import vllm.env_override  # isort:skip  # noqa: F401

 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -23,19 +24,6 @@ from vllm.outputs import (ClassificationOutput, ClassificationRequestOutput,
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
-
-# set some common config/environment variables that should be set
-# for all processes created by vllm and all processes
-# that interact with vllm workers.
-# they are executed whenever `import vllm` is called.
-# see https://github.com/NVIDIA/nccl/issues/1234
-os.environ['NCCL_CUMEM_ENABLE'] = '0'
-
-# see https://github.com/vllm-project/vllm/issues/10480
-os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
-
-# see https://github.com/vllm-project/vllm/issues/10619
-torch._inductor.config.compile_threads = 1

 __all__ = [
     "__version__",
     "__version_tuple__",

vllm/env_override.py (new file, 21 lines)

@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: Apache-2.0
+import os
+
+import torch
+
+# set some common config/environment variables that should be set
+# for all processes created by vllm and all processes
+# that interact with vllm workers.
+# they are executed whenever `import vllm` is called.
+# see https://github.com/NVIDIA/nccl/issues/1234
+os.environ['NCCL_CUMEM_ENABLE'] = '0'
+
+# see https://github.com/vllm-project/vllm/pull/15951
+# it avoids unintentional cuda initialization from torch.cuda.is_available()
+os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1'
+
+# see https://github.com/vllm-project/vllm/issues/10480
+os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
+
+# see https://github.com/vllm-project/vllm/issues/10619
+torch._inductor.config.compile_threads = 1
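
Because vllm/__init__.py now imports vllm.env_override before the other vllm modules, these overrides take effect in any process that simply imports vllm. A hedged usage check (assumed example, not part of the commit):

    import os

    import vllm  # noqa: F401  -- importing vllm runs vllm.env_override first

    import torch

    # The overrides from env_override.py should now be visible in this process.
    assert os.environ.get('NCCL_CUMEM_ENABLE') == '0'
    assert os.environ.get('PYTORCH_NVML_BASED_CUDA_CHECK') == '1'
    assert os.environ.get('TORCHINDUCTOR_COMPILE_THREADS') == '1'
    assert torch._inductor.config.compile_threads == 1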