compilation is fixed

2026-07-10 20:17:17 +08:00 · 2025-02-06 20:49:29 +00:00 · 2025-02-06 20:49:29 +00:00 · 70b4e46e70
commit 70b4e46e70
parent 5fb9dbe6f6
3 changed files with 42 additions and 13 deletions
--- a/requirements-common.txt
+++ b/requirements-common.txt
@ -5,7 +5,7 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
-transformers >= 4.45.2  # Required for Llama 3.2 and Qwen2-VL.
+transformers >= 4.48.2  # Required for Bamba model and Transformers backend.
 tokenizers >= 0.19.1  # Required for Llama 3.
 protobuf # Required by LlamaTokenizer.
 fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
@ -34,6 +34,6 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.8.1 # required for compressed-tensors
+compressed-tensors == 0.9.1 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
--- a/requirements-test.txt
+++ b/requirements-test.txt
@ -106,9 +106,17 @@ dnspython==2.7.0
 docutils==0.16
    # via awscli
 einops==0.8.0
-    # via -r requirements-test.in
+    # via
    #   -r requirements-test.in
    #   encodec
    #   vector-quantize-pytorch
    #   vocos
 einx==0.3.0
    # via vector-quantize-pytorch
 email-validator==2.2.0
    # via pydantic
 encodec==0.1.1
    # via vocos
 evaluate==0.4.3
    # via lm-eval
 fastparquet==2024.11.0
@ -125,6 +133,8 @@ filelock==3.16.1
    #   triton
 fonttools==4.54.1
    # via matplotlib
 frozendict==2.4.6
    # via einx
 frozenlist==1.5.0
    # via
    #   aiohttp
@ -159,6 +169,7 @@ huggingface-hub==0.26.2
    #   timm
    #   tokenizers
    #   transformers
    #   vocos
 idna==3.10
    # via
    #   anyio
@ -261,6 +272,8 @@ numpy==1.26.4
    #   cupy-cuda12x
    #   datasets
    #   decord
    #   einx
    #   encodec
    #   evaluate
    #   fastparquet
    #   genai-perf
@ -283,6 +296,7 @@ numpy==1.26.4
    #   torchvision
    #   transformers
    #   tritonclient
    #   vocos
 nvidia-cublas-cu12==12.4.5.8
    # via
    #   nvidia-cudnn-cu12
@ -455,6 +469,7 @@ pyyaml==6.0.2
    #   responses
    #   timm
    #   transformers
    #   vocos
 ray[adag]==2.40.0
    # via -r requirements-test.in
 redis==5.2.0
@ -517,6 +532,7 @@ scipy==1.13.1
    #   scikit-learn
    #   sentence-transformers
    #   statsmodels
    #   vocos
 sentence-transformers==3.2.1
    # via -r requirements-test.in
 sentencepiece==0.2.0
@ -540,7 +556,9 @@ sqlitedict==2.1.0
 statsmodels==0.14.4
    # via genai-perf
 sympy==1.13.1
-    # via torch
+    # via
    #   einx
    #   torch
 tabledata==1.3.3
    # via pytablewriter
 tabulate==0.9.0
@ -568,12 +586,21 @@ torch==2.5.1
    #   -r requirements-test.in
    #   accelerate
    #   bitsandbytes
    #   encodec
    #   lm-eval
    #   peft
    #   sentence-transformers
    #   tensorizer
    #   timm
    #   torchaudio
    #   torchvision
    #   vector-quantize-pytorch
    #   vocos
 torchaudio==2.5.1
    # via
    #   -r requirements-test.in
    #   encodec
    #   vocos
 torchvision==0.20.1
    # via timm
 tqdm==4.66.6
@ -584,13 +611,15 @@ tqdm==4.66.6
    #   lm-eval
    #   nltk
    #   peft
    #   pqdm
    #   sentence-transformers
    #   tqdm-multiprocess
    #   transformers
 tqdm-multiprocess==0.0.11
    # via lm-eval
-transformers==4.47.0
+transformers==4.48.2
    # via
    #   -r requirements-test.in
    #   genai-perf
    #   lm-eval
    #   peft
@ -615,6 +644,7 @@ typing-extensions==4.12.2
    #   huggingface-hub
    #   librosa
    #   mistral-common
    #   pqdm
    #   pydantic
    #   pydantic-core
    #   torch
@ -626,6 +656,10 @@ urllib3==2.2.3
    #   requests
    #   responses
    #   tritonclient
 vector-quantize-pytorch==1.21.2
    # via -r requirements-test.in
 vocos==0.1.0
    # via -r requirements-test.in
 word2number==1.1
    # via lm-eval
 xxhash==3.5.0
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@ -766,11 +766,6 @@ class TPUModelRunner(ModelRunnerBase):
        logger.info("    -- Compilation for decode done in %.2f [secs].",
                    end - start)
    def _initialize_kv_cache(self):
        """Compute a KV cache config from this runner's KV cache spec.

        Calls ``self.get_kv_cache_spec()`` and passes the result to
        ``get_kv_cache_config``. The resulting config is only bound to a
        local name; nothing is stored on ``self`` or returned, so the
        method implicitly returns ``None``.

        NOTE(review): ``vllm_config`` and ``availble_gpu_memory`` are
        neither parameters nor assigned inside this method. Unless they
        are defined at module scope (not visible in this excerpt), calling
        this method raises ``NameError``. ``availble_gpu_memory`` is
        presumably a misspelling of "available" -- confirm before reuse.
        """
        # The spec is obtained from the runner itself.
        kv_cache_spec = self.get_kv_cache_spec()
        # NOTE(review): result is unused after this assignment; see the
        # docstring regarding the two unbound names passed here.
        kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
                                              availble_gpu_memory)
    def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
        """
        Initialize KV cache based on `kv_cache_config`.