Support download models from www.modelscope.cn (#1588)
This commit is contained in: parent bb00f66e19, commit edb305584b
@@ -40,6 +40,16 @@ Initialize vLLM's engine for offline inference with the ``LLM`` class and the `O
 
     llm = LLM(model="facebook/opt-125m")
 
+To use a model from www.modelscope.cn:
+
+.. code-block:: shell
+
+    export VLLM_USE_MODELSCOPE=True
+
+.. code-block:: python
+
+    llm = LLM(model="qwen/Qwen-7B-Chat", revision="v1.1.8", trust_remote_code=True)
+
 Call ``llm.generate`` to generate the outputs. It adds the input prompts to the vLLM engine's waiting queue and executes the engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens.
 
 .. code-block:: python
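Since ``VLLM_USE_MODELSCOPE`` is only read from the process environment (see the ``os.environ.get`` check added to ``vllm/config.py`` further down), it can equally be set from Python before the ``LLM`` is constructed. A minimal sketch, reusing the same model as above:

    import os

    # Must be set before constructing the LLM, because ModelConfig reads
    # the variable once at initialization time.
    os.environ["VLLM_USE_MODELSCOPE"] = "True"

    from vllm import LLM

    llm = LLM(model="qwen/Qwen-7B-Chat", revision="v1.1.8", trust_remote_code=True)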
@@ -67,6 +77,16 @@ Start the server:
 
     $ python -m vllm.entrypoints.api_server
 
+To use a model from www.modelscope.cn:
+
+.. code-block:: console
+
+    $ VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.api_server \
+    $ --model="qwen/Qwen-7B-Chat" \
+    $ --revision="v1.1.8" \
+    $ --trust-remote-code
+
 By default, this command starts the server at ``http://localhost:8000`` with the OPT-125M model.
 
 Query the model in shell:
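For the "Query the model in shell" step that follows, the demo server exposes a ``/generate`` endpoint that takes a JSON body with a ``prompt`` plus sampling parameters. A hedged Python equivalent of that shell query, assuming the default host and port and the ``requests`` package:

    import requests

    # POST a prompt to the demo server; extra JSON keys are treated as
    # sampling parameters (n, temperature, max_tokens, ...).
    response = requests.post(
        "http://localhost:8000/generate",
        json={"prompt": "San Francisco is a", "n": 1, "temperature": 0.0},
    )
    print(response.json())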
@@ -95,6 +115,13 @@ Start the server:
 
     $ python -m vllm.entrypoints.openai.api_server \
     $ --model facebook/opt-125m
 
+To use a model from www.modelscope.cn:
+
+.. code-block:: console
+
+    $ VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.openai.api_server \
+    $ --model="qwen/Qwen-7B-Chat" --revision="v1.1.8" --trust-remote-code
+
 By default, it starts the server at ``http://localhost:8000``. You can specify the address with the ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the above command) and implements the `list models <https://platform.openai.com/docs/api-reference/models/list>`_ and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. We are actively adding support for more endpoints.
 
 This server can be queried in the same format as the OpenAI API. For example, list the models:
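Both endpoints named above can also be exercised from Python. A sketch, assuming the ModelScope invocation from this hunk (the ``model`` field must match the ``--model`` value the server was started with):

    import requests

    # List the models served by the OpenAI-compatible server.
    print(requests.get("http://localhost:8000/v1/models").json())

    # Request a completion from the served model.
    completion = requests.post(
        "http://localhost:8000/v1/completions",
        json={
            "model": "qwen/Qwen-7B-Chat",
            "prompt": "San Francisco is a",
            "max_tokens": 16,
            "temperature": 0,
        },
    )
    print(completion.json())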
@@ -81,4 +81,18 @@ Alternatively, you can raise an issue on our `GitHub <https://github.com/vllm-pr
     output = llm.generate("Hello, my name is")
     print(output)
 
+To use a model from www.modelscope.cn:
+
+.. code-block:: shell
+
+    $ export VLLM_USE_MODELSCOPE=True
+
+.. code-block:: python
+
+    from vllm import LLM
+
+    llm = LLM(model=..., revision=..., trust_remote_code=True)  # Name or path of your model
+    output = llm.generate("Hello, my name is")
+    print(output)
+
 If vLLM successfully generates text, it indicates that your model is supported.
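Filling the placeholders in the snippet above with the ModelScope model used elsewhere in this commit (an illustrative choice, not part of the original snippet):

    from vllm import LLM

    # Assumes VLLM_USE_MODELSCOPE=True was exported as shown above.
    llm = LLM(model="qwen/Qwen-7B-Chat", revision="v1.1.8", trust_remote_code=True)
    output = llm.generate("Hello, my name is")
    print(output)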
@@ -1,4 +1,5 @@
 from typing import Optional, Union
+import os
 
 import torch
 from transformers import PretrainedConfig
@@ -76,7 +77,18 @@ class ModelConfig:
         self.tokenizer_revision = tokenizer_revision
         self.quantization = quantization
 
-        self.hf_config = get_config(model, trust_remote_code, revision)
+        if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true":
+            # Download the model from the ModelScope hub;
+            # lazy import so that modelscope is not required for normal use.
+            from modelscope.hub.snapshot_download import snapshot_download  # pylint: disable=C
+            model_path = snapshot_download(model_id=model,
+                                           cache_dir=download_dir,
+                                           revision=revision)
+            self.model = model_path
+            self.download_dir = model_path
+            self.tokenizer = model_path
+
+        self.hf_config = get_config(self.model, trust_remote_code, revision)
         self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
         self.max_model_len = _get_and_verify_max_len(self.hf_config,
                                                      max_model_len)
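The effect of this hunk: when ``VLLM_USE_MODELSCOPE`` is set, the model is fetched from the ModelScope hub once, and ``self.model``, ``self.download_dir``, and ``self.tokenizer`` are all repointed at the local snapshot, so the HF config, tokenizer, and weights are subsequently loaded from disk. A standalone sketch of the download step it relies on (same call as in the hunk; the model ID is the one used in the docs above):

    from modelscope.hub.snapshot_download import snapshot_download

    # Download the model (or reuse a cached copy) and return its local path.
    model_path = snapshot_download(model_id="qwen/Qwen-7B-Chat", revision="v1.1.8")
    print(model_path)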
@@ -648,9 +648,10 @@ if __name__ == "__main__":
     max_model_len = engine_model_config.max_model_len
 
     # A separate tokenizer to map token IDs to strings.
-    tokenizer = get_tokenizer(engine_args.tokenizer,
-                              tokenizer_mode=engine_args.tokenizer_mode,
-                              trust_remote_code=engine_args.trust_remote_code)
+    tokenizer = get_tokenizer(
+        engine_model_config.tokenizer,
+        tokenizer_mode=engine_model_config.tokenizer_mode,
+        trust_remote_code=engine_model_config.trust_remote_code)
 
     uvicorn.run(app,
                 host=args.host,
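This hunk complements the config change: after ModelScope resolution, ``engine_model_config.tokenizer`` holds the local snapshot path, while ``engine_args.tokenizer`` still holds the raw CLI value, so the tokenizer must be built from the resolved config. A hedged sketch of that resolved path in isolation:

    from modelscope.hub.snapshot_download import snapshot_download
    from vllm.transformers_utils.tokenizer import get_tokenizer

    # Resolve the hub ID to a local directory, as ModelConfig does when
    # VLLM_USE_MODELSCOPE is set, then load the tokenizer from that path.
    model_path = snapshot_download(model_id="qwen/Qwen-7B-Chat", revision="v1.1.8")
    tokenizer = get_tokenizer(model_path, tokenizer_mode="auto",
                              trust_remote_code=True)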