diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 8fa54454907b..bd1cac2ab9b5 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -34,6 +34,9 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor): # Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id() + # Disable torch async compiling, which won't work with daemonic processes + os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" + from torch.cuda import device_count assert world_size <= device_count(), ( "please set tensor_parallel_size to less than max local gpu count")