Disable cuda version check in vllm-openai image (#4530)

zhaoyang-star 2024-05-06 07:58:55 +08:00 committed by GitHub
parent c7f2cf2b7f
commit 0650e5935b
2 changed files with 2 additions and 33 deletions

vllm/config.py

@@ -4,15 +4,13 @@ from dataclasses import dataclass, field, fields
 from typing import TYPE_CHECKING, ClassVar, List, Optional, Union
 
 import torch
-from packaging.version import Version
 from transformers import PretrainedConfig
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
                                                      get_quantization_config)
 from vllm.transformers_utils.config import get_config, get_hf_text_config
-from vllm.utils import (get_cpu_memory, get_nvcc_cuda_version, is_cpu, is_hip,
-                        is_neuron)
+from vllm.utils import get_cpu_memory, is_cpu, is_hip, is_neuron
 
 GPTQMarlinConfig = get_quantization_config("gptq_marlin")
@@ -369,13 +367,6 @@ class CacheConfig:
         if self.cache_dtype == "auto":
             pass
         elif self.cache_dtype == "fp8":
-            if not is_hip():
-                nvcc_cuda_version = get_nvcc_cuda_version()
-                if nvcc_cuda_version is not None \
-                        and nvcc_cuda_version < Version("11.8"):
-                    raise ValueError(
-                        "FP8 is not supported when cuda version is"
-                        "lower than 11.8.")
             logger.info(
                 "Using fp8 data type to store kv cache. It reduces the GPU "
                 "memory footprint and boosts the performance. "

vllm/utils.py

@@ -19,7 +19,6 @@ from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic,
 
 import psutil
 import torch
-from packaging.version import Version, parse
 
 import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
@@ -314,27 +313,6 @@ def cdiv(a: int, b: int) -> int:
     return -(a // -b)
 
 
-@lru_cache(maxsize=None)
-def get_nvcc_cuda_version() -> Optional[Version]:
-    cuda_home = envs.CUDA_HOME
-    if not cuda_home:
-        cuda_home = '/usr/local/cuda'
-        if os.path.isfile(cuda_home + '/bin/nvcc'):
-            logger.info(
-                'CUDA_HOME is not found in the environment. '
-                'Using %s as CUDA_HOME.', cuda_home)
-        else:
-            logger.warning('Not found nvcc in %s. Skip cuda version check!',
-                           cuda_home)
-            return None
-    nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"],
-                                          universal_newlines=True)
-    output = nvcc_output.split()
-    release_idx = output.index("release") + 1
-    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
-    return nvcc_cuda_version
-
-
 def _generate_random_fp8(
     tensor: torch.tensor,
     low: float,
@@ -560,7 +538,7 @@ def maybe_expand_dim(tensor: torch.Tensor,
 def merge_dicts(dict1: Dict[Any, List[Any]],
                 dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]:
     """Merge 2 dicts that have key -> List of items.
     When a key conflicts, the values in dict1 is prioritized.
     """
     merged_dict = defaultdict(list)
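The helper removed above determined the CUDA toolkit version by shelling out to nvcc. Below is a minimal sketch of one possible nvcc-free alternative, purely illustrative and not what this commit does (the commit only drops the check); the function names are hypothetical, not vLLM API.

# Illustrative only: gate FP8 KV-cache support on the CUDA version reported by
# the installed PyTorch wheel instead of parsing `nvcc -V` output.
from typing import Optional

import torch
from packaging.version import Version


def get_torch_cuda_version() -> Optional[Version]:
    # torch.version.cuda is a string such as "12.1" on CUDA builds, None otherwise.
    cuda_str = torch.version.cuda
    return Version(cuda_str) if cuda_str else None


def verify_fp8_cache_supported() -> None:
    version = get_torch_cuda_version()
    if version is not None and version < Version("11.8"):
        raise ValueError("FP8 KV cache requires CUDA 11.8 or newer.")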