diff --git a/vllm/envs.py b/vllm/envs.py
index cd545f32c4301..dc23c8ea5314d 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -811,10 +811,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # all2all backend for vllm's expert parallel communication
     "VLLM_ALL2ALL_BACKEND": lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"),
-
-    # check that the cudagraphs input addresses are correct before replaying
-    "VLLM_CUDAGRAPH_SANITIZER":
-    lambda: os.getenv("VLLM_CUDAGRAPH_SANITIZER", "0") == "1",
 }
 
 # end-env-vars-definition
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 822bde906dc97..f1cb77f64eae7 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -29,7 +29,6 @@ from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.platforms.interface import CpuArchEnum
 from vllm.utils import direct_register_custom_op
-from vllm.v1.worker.ubatching import get_current_ubatch_context
 
 has_pplx = importlib.util.find_spec("pplx_kernels") is not None
 
@@ -306,6 +305,7 @@ class AllToAllCache:
             self._cache[key] = instance
         return instance
 
+
 # Global singleton
 _all_to_all_cache = AllToAllCache()
 