mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 04:26:00 +08:00
[Fix] Do not pin memory when in WSL (#312)
This commit is contained in:
parent
f72297562f
commit
85de093472
@ -1,4 +1,5 @@
|
||||
import enum
|
||||
from platform import uname
|
||||
import uuid
|
||||
|
||||
import psutil
|
||||
@ -36,3 +37,7 @@ def get_cpu_memory() -> int:
|
||||
|
||||
def random_uuid() -> str:
|
||||
return str(uuid.uuid4().hex)
|
||||
|
||||
def in_wsl() -> bool:
|
||||
# Reference: https://github.com/microsoft/WSL/issues/4071
|
||||
return "microsoft" in " ".join(uname()).lower()
|
||||
|
||||
@ -5,6 +5,10 @@ import torch
|
||||
|
||||
from vllm import cache_ops
|
||||
from vllm.config import CacheConfig, ModelConfig, ParallelConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import in_wsl
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
KVCache = Tuple[torch.Tensor, torch.Tensor]
|
||||
|
||||
@ -85,16 +89,22 @@ class CacheEngine:
|
||||
cpu_cache: List[KVCache] = []
|
||||
key_block_shape = self.get_key_block_shape()
|
||||
value_block_shape = self.get_value_block_shape()
|
||||
pin_memory = not in_wsl()
|
||||
if not pin_memory:
|
||||
# Pinning memory in WSL is not supported.
|
||||
# https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
|
||||
logger.warn("Using 'pin_memory=False' as WSL is detected. "
|
||||
"This may slow down the performance.")
|
||||
for _ in range(self.num_layers):
|
||||
key_blocks = torch.empty(
|
||||
size=(self.num_cpu_blocks, *key_block_shape),
|
||||
dtype=self.dtype,
|
||||
pin_memory=True,
|
||||
pin_memory=pin_memory,
|
||||
)
|
||||
value_blocks = torch.empty(
|
||||
size=(self.num_cpu_blocks, *value_block_shape),
|
||||
dtype=self.dtype,
|
||||
pin_memory=True,
|
||||
pin_memory=pin_memory,
|
||||
)
|
||||
cpu_cache.append((key_blocks, value_blocks))
|
||||
return cpu_cache
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user