mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-17 05:55:01 +08:00
[Bugfix] More type hint fixes for py 3.8 (#4039)
This commit is contained in:
parent
546e721168
commit
5c2e66e487
@ -39,7 +39,7 @@ class ExecutorBase(ABC):
|
|||||||
ExecutorBase may require modification of the result, e.g. to ensure the
|
ExecutorBase may require modification of the result, e.g. to ensure the
|
||||||
selected cache sizes are compatible with all workers.
|
selected cache sizes are compatible with all workers.
|
||||||
|
|
||||||
Returns a tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
|
Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
|
||||||
are blocks that are "active" on the device and can be appended to.
|
are blocks that are "active" on the device and can be appended to.
|
||||||
num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
|
num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
|
||||||
appended to.
|
appended to.
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
"""A CPU worker class."""
|
"""A CPU worker class."""
|
||||||
from typing import Dict, List, Optional
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.distributed
|
import torch.distributed
|
||||||
@ -157,7 +157,7 @@ class CPUWorker(LoraNotSupportedWorkerBase):
|
|||||||
def load_model(self):
|
def load_model(self):
|
||||||
self.model_runner.load_model()
|
self.model_runner.load_model()
|
||||||
|
|
||||||
def determine_num_available_blocks(self) -> tuple[int, int]:
|
def determine_num_available_blocks(self) -> Tuple[int, int]:
|
||||||
"""Determine the number of blocks available for the KV cache.
|
"""Determine the number of blocks available for the KV cache.
|
||||||
|
|
||||||
This determines how many KV blocks can fit into the configured CPU
|
This determines how many KV blocks can fit into the configured CPU
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
"""A Neuron worker class."""
|
"""A Neuron worker class."""
|
||||||
from typing import List, Optional
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.distributed
|
import torch.distributed
|
||||||
@ -40,7 +40,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase):
|
|||||||
def load_model(self):
|
def load_model(self):
|
||||||
self.model_runner.load_model()
|
self.model_runner.load_model()
|
||||||
|
|
||||||
def determine_num_available_blocks(self) -> tuple[int, int]:
|
def determine_num_available_blocks(self) -> Tuple[int, int]:
|
||||||
"""Determine the number of available KV blocks.
|
"""Determine the number of available KV blocks.
|
||||||
|
|
||||||
Swapping is not yet supported, so always return num_cpu_blocks=0.
|
Swapping is not yet supported, so always return num_cpu_blocks=0.
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Dict, List
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
|
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
|
||||||
@ -18,14 +18,14 @@ class WorkerBase(ABC):
|
|||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def determine_num_available_blocks(self) -> tuple[int, int]:
|
def determine_num_available_blocks(self) -> Tuple[int, int]:
|
||||||
"""Determine the number of available blocks for the GPU KV cache and
|
"""Determine the number of available blocks for the GPU KV cache and
|
||||||
swappable CPU KV cache.
|
swappable CPU KV cache.
|
||||||
|
|
||||||
The implementation may run profiling or other heuristics to determine
|
The implementation may run profiling or other heuristics to determine
|
||||||
the size of caches.
|
the size of caches.
|
||||||
|
|
||||||
Returns a tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
|
Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
|
||||||
are blocks that are "active" on the device and can be appended to.
|
are blocks that are "active" on the device and can be appended to.
|
||||||
num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
|
num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
|
||||||
appended to.
|
appended to.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user