[Fix] Do not pin memory when in WSL (#312)

2026-01-28 05:47:14 +08:00 · 2023-06-29 15:00:21 -07:00 · 2023-06-29 15:00:21 -07:00 · 85de093472
commit 85de093472
parent f72297562f
2 changed files with 17 additions and 2 deletions
--- a/vllm/utils.py
+++ b/vllm/utils.py
@ -1,4 +1,5 @@
 import enum
+from platform import uname
 import uuid

 import psutil
@ -36,3 +37,7 @@ def get_cpu_memory() -> int:

 def random_uuid() -> str:
    return str(uuid.uuid4().hex)
+
+def in_wsl() -> bool:
+    # Reference: https://github.com/microsoft/WSL/issues/4071
+    return "microsoft" in " ".join(uname()).lower()
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
@ -5,6 +5,10 @@ import torch

 from vllm import cache_ops
 from vllm.config import CacheConfig, ModelConfig, ParallelConfig
+from vllm.logger import init_logger
+from vllm.utils import in_wsl
+
+logger = init_logger(__name__)

 KVCache = Tuple[torch.Tensor, torch.Tensor]

@ -85,16 +89,22 @@ class CacheEngine:
        cpu_cache: List[KVCache] = []
        key_block_shape = self.get_key_block_shape()
        value_block_shape = self.get_value_block_shape()
+        pin_memory = not in_wsl()
+        if not pin_memory:
+            # Pinning memory in WSL is not supported.
+            # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
+            logger.warn("Using 'pin_memory=False' as WSL is detected. "
+                        "This may slow down the performance.")
        for _ in range(self.num_layers):
            key_blocks = torch.empty(
                size=(self.num_cpu_blocks, *key_block_shape),
                dtype=self.dtype,
-                pin_memory=True,
+                pin_memory=pin_memory,
            )
            value_blocks = torch.empty(
                size=(self.num_cpu_blocks, *value_block_shape),
                dtype=self.dtype,
-                pin_memory=True,
+                pin_memory=pin_memory,
            )
            cpu_cache.append((key_blocks, value_blocks))
        return cpu_cache