Disable cuda version check in vllm-openai image (#4530)
commit 0650e5935b (parent c7f2cf2b7f)
vllm/config.py
@@ -4,15 +4,13 @@ from dataclasses import dataclass, field, fields
 from typing import TYPE_CHECKING, ClassVar, List, Optional, Union
 
 import torch
-from packaging.version import Version
 from transformers import PretrainedConfig
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
                                                      get_quantization_config)
 from vllm.transformers_utils.config import get_config, get_hf_text_config
-from vllm.utils import (get_cpu_memory, get_nvcc_cuda_version, is_cpu, is_hip,
-                        is_neuron)
+from vllm.utils import get_cpu_memory, is_cpu, is_hip, is_neuron
 
 GPTQMarlinConfig = get_quantization_config("gptq_marlin")
 
@@ -369,13 +367,6 @@ class CacheConfig:
         if self.cache_dtype == "auto":
             pass
         elif self.cache_dtype == "fp8":
-            if not is_hip():
-                nvcc_cuda_version = get_nvcc_cuda_version()
-                if nvcc_cuda_version is not None \
-                        and nvcc_cuda_version < Version("11.8"):
-                    raise ValueError(
-                        "FP8 is not supported when cuda version is"
-                        "lower than 11.8.")
             logger.info(
                 "Using fp8 data type to store kv cache. It reduces the GPU "
                 "memory footprint and boosts the performance. "
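The guard removed above rejected the fp8 KV cache on CUDA toolkits older than 11.8, using `packaging.version` ordering for the comparison. A minimal standalone sketch of that comparison (the version string below is illustrative, not taken from the diff):

from packaging.version import Version

# PEP 440 ordering: components compare numerically, so "11.10" > "11.8",
# which a plain string comparison would get wrong.
nvcc_cuda_version = Version("11.7")  # illustrative; formerly parsed from `nvcc -V`

if nvcc_cuda_version < Version("11.8"):
    raise ValueError("FP8 is not supported when cuda version is "
                     "lower than 11.8.")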
vllm/utils.py
@@ -19,7 +19,6 @@ from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic,
 
 import psutil
 import torch
-from packaging.version import Version, parse
 
 import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
@@ -314,27 +313,6 @@ def cdiv(a: int, b: int) -> int:
     return -(a // -b)
 
 
-@lru_cache(maxsize=None)
-def get_nvcc_cuda_version() -> Optional[Version]:
-    cuda_home = envs.CUDA_HOME
-    if not cuda_home:
-        cuda_home = '/usr/local/cuda'
-        if os.path.isfile(cuda_home + '/bin/nvcc'):
-            logger.info(
-                'CUDA_HOME is not found in the environment. '
-                'Using %s as CUDA_HOME.', cuda_home)
-        else:
-            logger.warning('Not found nvcc in %s. Skip cuda version check!',
-                           cuda_home)
-            return None
-    nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"],
-                                          universal_newlines=True)
-    output = nvcc_output.split()
-    release_idx = output.index("release") + 1
-    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
-    return nvcc_cuda_version
-
-
 def _generate_random_fp8(
     tensor: torch.tensor,
     low: float,
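With `get_nvcc_cuda_version` deleted, the fp8 path no longer needs nvcc inside the container. If a CUDA version is still wanted at runtime, one alternative that avoids shelling out to the toolkit is PyTorch's build metadata; a hedged sketch, assuming a CUDA-enabled torch install:

import torch
from packaging.version import Version

# torch.version.cuda reports the CUDA version PyTorch was compiled
# against (e.g. "12.1"); it is None on CPU-only builds.
cuda_str = torch.version.cuda
cuda_version = Version(cuda_str) if cuda_str is not None else None

if cuda_version is not None and cuda_version < Version("11.8"):
    print("fp8 KV cache may not be supported on this CUDA build")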
@@ -560,7 +538,7 @@ def maybe_expand_dim(tensor: torch.Tensor,
 def merge_dicts(dict1: Dict[Any, List[Any]],
                 dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]:
     """Merge 2 dicts that have key -> List of items.
 
     When a key conflicts, the values in dict1 is prioritized.
     """
     merged_dict = defaultdict(list)
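Only the head of `merge_dicts` is visible above. For reference, a sketch of a merge matching the documented semantics (on a key conflict, dict1's items lead the merged list); this is an illustration inferred from the docstring, not necessarily vllm's exact body:

from collections import defaultdict
from typing import Any, Dict, List

def merge_dicts(dict1: Dict[Any, List[Any]],
                dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]:
    """Merge two key -> List dicts; on conflict, dict1's values come first."""
    merged: Dict[Any, List[Any]] = defaultdict(list)
    for d in (dict1, dict2):  # dict1 first, so its items are prioritized
        for key, values in d.items():
            merged[key].extend(values)
    return dict(merged)

# Conflicting key "a": dict1's [1] precedes dict2's [2].
assert merge_dicts({"a": [1]}, {"a": [2], "b": [3]}) == {"a": [1, 2], "b": [3]}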