mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-21 17:47:06 +08:00
Support download models from www.modelscope.cn (#1588)
This commit is contained in:
parent
bb00f66e19
commit
edb305584b
@ -40,6 +40,16 @@ Initialize vLLM's engine for offline inference with the ``LLM`` class and the `O
|
|||||||
|
|
||||||
llm = LLM(model="facebook/opt-125m")
|
llm = LLM(model="facebook/opt-125m")
|
||||||
|
|
||||||
|
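To use a model from www.modelscope.cn instead, set the ``VLLM_USE_MODELSCOPE`` environment variable before creating the ``LLM`` instance: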
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
export VLLM_USE_MODELSCOPE=True
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
llm = LLM(model="qwen/Qwen-7B-Chat", revision="v1.1.8", trust_remote_code=True)
|
||||||
|
|
||||||
Call ``llm.generate`` to generate the outputs. It adds the input prompts to vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens.
|
Call ``llm.generate`` to generate the outputs. It adds the input prompts to vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens.
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
@ -67,6 +77,16 @@ Start the server:
|
|||||||
|
|
||||||
$ python -m vllm.entrypoints.api_server
|
$ python -m vllm.entrypoints.api_server
|
||||||
|
|
||||||
|
To serve a model from www.modelscope.cn instead, set the ``VLLM_USE_MODELSCOPE`` environment variable when starting the server:
|
||||||
|
|
||||||
|
.. code-block:: console
|
||||||
|
|
||||||
|
$ VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.api_server \
|
||||||
|
$ --model="qwen/Qwen-7B-Chat" \
|
||||||
|
$ --revision="v1.1.8" \
|
||||||
|
$ --trust-remote-code
|
||||||
|
|
||||||
|
|
||||||
By default, this command starts the server at ``http://localhost:8000`` with the OPT-125M model.
|
By default, this command starts the server at ``http://localhost:8000`` with the OPT-125M model.
|
||||||
|
|
||||||
Query the model in shell:
|
Query the model in shell:
|
||||||
@ -95,6 +115,13 @@ Start the server:
|
|||||||
$ python -m vllm.entrypoints.openai.api_server \
|
$ python -m vllm.entrypoints.openai.api_server \
|
||||||
$ --model facebook/opt-125m
|
$ --model facebook/opt-125m
|
||||||
|
|
||||||
|
To serve a model from www.modelscope.cn instead, set the ``VLLM_USE_MODELSCOPE`` environment variable when starting the server:
|
||||||
|
|
||||||
|
.. code-block:: console
|
||||||
|
|
||||||
|
$ VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.openai.api_server \
|
||||||
|
$ --model="qwen/Qwen-7B-Chat" --revision="v1.1.8" --trust-remote-code
|
||||||
|
|
||||||
By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the above command) and implements `list models <https://platform.openai.com/docs/api-reference/models/list>`_ and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. We are actively adding support for more endpoints.
|
By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the above command) and implements `list models <https://platform.openai.com/docs/api-reference/models/list>`_ and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. We are actively adding support for more endpoints.
|
||||||
|
|
||||||
This server can be queried in the same format as OpenAI API. For example, list the models:
|
This server can be queried in the same format as OpenAI API. For example, list the models:
|
||||||
|
|||||||
@ -81,4 +81,18 @@ Alternatively, you can raise an issue on our `GitHub <https://github.com/vllm-pr
|
|||||||
output = llm.generate("Hello, my name is")
|
output = llm.generate("Hello, my name is")
|
||||||
print(output)
|
print(output)
|
||||||
|
|
||||||
|
To use a model from www.modelscope.cn:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
$ export VLLM_USE_MODELSCOPE=True
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
from vllm import LLM
|
||||||
|
|
||||||
|
llm = LLM(model=..., revision=..., trust_remote_code=True) # Name or path of your model
|
||||||
|
output = llm.generate("Hello, my name is")
|
||||||
|
print(output)
|
||||||
|
|
||||||
If vLLM successfully generates text, it indicates that your model is supported.
|
If vLLM successfully generates text, it indicates that your model is supported.
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
from typing import Optional, Union
|
from typing import Optional, Union
|
||||||
|
import os
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
@ -76,7 +77,18 @@ class ModelConfig:
|
|||||||
self.tokenizer_revision = tokenizer_revision
|
self.tokenizer_revision = tokenizer_revision
|
||||||
self.quantization = quantization
|
self.quantization = quantization
|
||||||
|
|
||||||
self.hf_config = get_config(model, trust_remote_code, revision)
|
if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true":
|
||||||
|
# Download the model from the ModelScope hub,
|
||||||
|
# lazy import so that modelscope is not required for normal use.
|
||||||
|
from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=C
|
||||||
|
model_path = snapshot_download(model_id=model,
|
||||||
|
cache_dir=download_dir,
|
||||||
|
revision=revision)
|
||||||
|
self.model = model_path
|
||||||
|
self.download_dir = model_path
|
||||||
|
self.tokenizer = model_path
|
||||||
|
|
||||||
|
self.hf_config = get_config(self.model, trust_remote_code, revision)
|
||||||
self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
|
self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
|
||||||
self.max_model_len = _get_and_verify_max_len(self.hf_config,
|
self.max_model_len = _get_and_verify_max_len(self.hf_config,
|
||||||
max_model_len)
|
max_model_len)
|
||||||
|
|||||||
@ -648,9 +648,10 @@ if __name__ == "__main__":
|
|||||||
max_model_len = engine_model_config.max_model_len
|
max_model_len = engine_model_config.max_model_len
|
||||||
|
|
||||||
# A separate tokenizer to map token IDs to strings.
|
# A separate tokenizer to map token IDs to strings.
|
||||||
tokenizer = get_tokenizer(engine_args.tokenizer,
|
tokenizer = get_tokenizer(
|
||||||
tokenizer_mode=engine_args.tokenizer_mode,
|
engine_model_config.tokenizer,
|
||||||
trust_remote_code=engine_args.trust_remote_code)
|
tokenizer_mode=engine_model_config.tokenizer_mode,
|
||||||
|
trust_remote_code=engine_model_config.trust_remote_code)
|
||||||
|
|
||||||
uvicorn.run(app,
|
uvicorn.run(app,
|
||||||
host=args.host,
|
host=args.host,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user