Disable cuda version check in vllm-openai image (#4530)

zhaoyang-star 2024-05-06 07:58:55 +08:00 committed by GitHub
parent c7f2cf2b7f
commit 0650e5935b
2 changed files with 2 additions and 33 deletions

vllm/config.py

@@ -4,15 +4,13 @@ from dataclasses import dataclass, field, fields
 from typing import TYPE_CHECKING, ClassVar, List, Optional, Union
 
 import torch
-from packaging.version import Version
 from transformers import PretrainedConfig
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
                                                      get_quantization_config)
 from vllm.transformers_utils.config import get_config, get_hf_text_config
-from vllm.utils import (get_cpu_memory, get_nvcc_cuda_version, is_cpu, is_hip,
-                        is_neuron)
+from vllm.utils import get_cpu_memory, is_cpu, is_hip, is_neuron
 
 GPTQMarlinConfig = get_quantization_config("gptq_marlin")
@@ -369,13 +367,6 @@ class CacheConfig:
         if self.cache_dtype == "auto":
             pass
         elif self.cache_dtype == "fp8":
-            if not is_hip():
-                nvcc_cuda_version = get_nvcc_cuda_version()
-                if nvcc_cuda_version is not None \
-                        and nvcc_cuda_version < Version("11.8"):
-                    raise ValueError(
-                        "FP8 is not supported when cuda version is"
-                        "lower than 11.8.")
             logger.info(
                 "Using fp8 data type to store kv cache. It reduces the GPU "
                 "memory footprint and boosts the performance. "

vllm/utils.py

@@ -19,7 +19,6 @@ from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic,
 
 import psutil
 import torch
-from packaging.version import Version, parse
 
 import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
@@ -314,27 +313,6 @@ def cdiv(a: int, b: int) -> int:
     return -(a // -b)
 
 
-@lru_cache(maxsize=None)
-def get_nvcc_cuda_version() -> Optional[Version]:
-    cuda_home = envs.CUDA_HOME
-    if not cuda_home:
-        cuda_home = '/usr/local/cuda'
-        if os.path.isfile(cuda_home + '/bin/nvcc'):
-            logger.info(
-                'CUDA_HOME is not found in the environment. '
-                'Using %s as CUDA_HOME.', cuda_home)
-        else:
-            logger.warning('Not found nvcc in %s. Skip cuda version check!',
-                           cuda_home)
-            return None
-    nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"],
-                                          universal_newlines=True)
-    output = nvcc_output.split()
-    release_idx = output.index("release") + 1
-    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
-    return nvcc_cuda_version
-
-
 def _generate_random_fp8(
     tensor: torch.tensor,
     low: float,
@@ -560,7 +538,7 @@ def maybe_expand_dim(tensor: torch.Tensor,
 def merge_dicts(dict1: Dict[Any, List[Any]],
                 dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]:
     """Merge 2 dicts that have key -> List of items.
     When a key conflicts, the values in dict1 is prioritized.
     """
     merged_dict = defaultdict(list)
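The helper removed above determined the CUDA toolkit version by shelling out to nvcc. Below is a minimal sketch of one possible nvcc-free alternative, purely illustrative and not what this commit does (the commit only drops the check); the function names are hypothetical, not vLLM API.

# Illustrative only: gate FP8 KV-cache support on the CUDA version reported by
# the installed PyTorch wheel instead of parsing `nvcc -V` output.
from typing import Optional

import torch
from packaging.version import Version


def get_torch_cuda_version() -> Optional[Version]:
    # torch.version.cuda is a string such as "12.1" on CUDA builds, None otherwise.
    cuda_str = torch.version.cuda
    return Version(cuda_str) if cuda_str else None


def verify_fp8_cache_supported() -> None:
    version = get_torch_cuda_version()
    if version is not None and version < Version("11.8"):
        raise ValueError("FP8 KV cache requires CUDA 11.8 or newer.")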