[Frontend] [Core] Tensorizer: support dynamic num_readers, update version (#4467)

2025-12-14 05:45:01 +08:00 · 2024-04-30 19:32:13 -04:00 · 2024-04-30 19:32:13 -04:00 · 715c2d854d
commit 715c2d854d
parent a494140433
3 changed files with 12 additions and 9 deletions
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -14,7 +14,7 @@ types-setuptools
 # testing
 pytest
-tensorizer==2.9.0a0
+tensorizer==2.9.0
 pytest-forked
 pytest-asyncio
 pytest-rerunfailures
--- a/setup.py
+++ b/setup.py
@@ -408,7 +408,7 @@ setup(
    install_requires=get_requirements(),
    ext_modules=ext_modules,
    extras_require={
-        "tensorizer": ["tensorizer==2.9.0a1"],
+        "tensorizer": ["tensorizer==2.9.0"],
    },
    cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
    package_data=package_data,
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -44,7 +44,7 @@ class TensorizerConfig:
                          str, bytes, os.PathLike, int]
    vllm_tensorized: bool
    verify_hash: Optional[bool] = False
-    num_readers: Optional[int] = 1
+    num_readers: Optional[int] = None
    encryption_keyfile: Optional[str] = None
    s3_access_key_id: Optional[str] = None
    s3_secret_access_key: Optional[str] = None
@@ -104,7 +104,7 @@ class TensorizerArgs:
                          str, bytes, os.PathLike, int]
    vllm_tensorized: bool
    verify_hash: Optional[bool] = False
-    num_readers: Optional[int] = 1
+    num_readers: Optional[int] = None
    encryption_keyfile: Optional[str] = None
    s3_access_key_id: Optional[str] = None
    s3_secret_access_key: Optional[str] = None
@@ -125,8 +125,9 @@ class TensorizerArgs:
          the hashes stored in the metadata. A `HashMismatchError` will be 
          raised if any of the hashes do not match.
      num_readers: Controls how many threads are allowed to read concurrently
-          from the source file. Default is 1. This greatly increases
+          from the source file. Default is `None`, which will dynamically set
-          performance.
+          the number of readers based on the number of available 
+          resources and model size. This greatly increases performance.
      encryption_keyfile: File path to a binary file containing a  
          binary key to use for decryption. `None` (the default) means 
          no decryption. See the example script in 
@@ -199,10 +200,12 @@ class TensorizerArgs:
            "use for decryption. Can be a file path or S3 network URI.")
        group.add_argument(
            "--num-readers",
-            default=1,
+            default=None,
            type=int,
            help="Controls how many threads are allowed to read concurrently "
-            "from the source file.")
+            "from the source file. Default is `None`, which will dynamically "
+            "set the number of readers based on the available resources "
+            "and model size. This greatly increases performance.")
        group.add_argument(
            "--s3-access-key-id",
            default=None,
@@ -337,7 +340,7 @@ class TensorizerAgent:
        per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
        after_mem = get_mem_usage()
        deserializer.close()
-        logger.info("Deserialized %s in %0.2fs, %f/s", total_bytes_str,
+        logger.info("Deserialized %s in %0.2fs, %s/s", total_bytes_str,
                    end - start, per_second)
        logger.info("Memory usage before: %s", before_mem)
        logger.info("Memory usage after: %s", after_mem)