mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-09 21:15:41 +08:00
[Doc] Move examples and further reorganize user guide (#18666)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
parent
9553fdb41e
commit
82e2339b06
@ -6,11 +6,6 @@
|
|||||||
|
|
||||||
[tool.ruff]
|
[tool.ruff]
|
||||||
line-length = 88
|
line-length = 88
|
||||||
exclude = [
|
|
||||||
# External file, leaving license intact
|
|
||||||
"examples/other/fp8/quantizer/quantize.py",
|
|
||||||
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
|
|
||||||
]
|
|
||||||
|
|
||||||
[tool.ruff.lint.per-file-ignores]
|
[tool.ruff.lint.per-file-ignores]
|
||||||
"vllm/third_party/**" = ["ALL"]
|
"vllm/third_party/**" = ["ALL"]
|
||||||
|
|||||||
@ -246,7 +246,7 @@ steps:
|
|||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language_embedding.py --seed 0
|
- python3 offline_inference/vision_language_embedding.py --seed 0
|
||||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||||
- VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||||
- python3 offline_inference/encoder_decoder.py
|
- python3 offline_inference/encoder_decoder.py
|
||||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 offline_inference/basic/classify.py
|
||||||
|
|||||||
2
.gitignore
vendored
2
.gitignore
vendored
@ -146,7 +146,7 @@ venv.bak/
|
|||||||
|
|
||||||
# mkdocs documentation
|
# mkdocs documentation
|
||||||
/site
|
/site
|
||||||
docs/getting_started/examples
|
docs/examples
|
||||||
|
|
||||||
# mypy
|
# mypy
|
||||||
.mypy_cache/
|
.mypy_cache/
|
||||||
|
|||||||
@ -6,11 +6,6 @@
|
|||||||
|
|
||||||
[tool.ruff]
|
[tool.ruff]
|
||||||
line-length = 88
|
line-length = 88
|
||||||
exclude = [
|
|
||||||
# External file, leaving license intact
|
|
||||||
"examples/other/fp8/quantizer/quantize.py",
|
|
||||||
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
|
|
||||||
]
|
|
||||||
|
|
||||||
[tool.ruff.lint.per-file-ignores]
|
[tool.ruff.lint.per-file-ignores]
|
||||||
"vllm/third_party/**" = ["ALL"]
|
"vllm/third_party/**" = ["ALL"]
|
||||||
|
|||||||
@ -5,11 +5,9 @@ nav:
|
|||||||
- getting_started/quickstart.md
|
- getting_started/quickstart.md
|
||||||
- getting_started/installation
|
- getting_started/installation
|
||||||
- Examples:
|
- Examples:
|
||||||
- Offline Inference: getting_started/examples/offline_inference
|
- Offline Inference: examples/offline_inference
|
||||||
- Online Serving: getting_started/examples/online_serving
|
- Online Serving: examples/online_serving
|
||||||
- Others:
|
- Others: examples/others
|
||||||
- LMCache: getting_started/examples/lmcache
|
|
||||||
- getting_started/examples/other/*
|
|
||||||
- Quick Links:
|
- Quick Links:
|
||||||
- User Guide: usage/README.md
|
- User Guide: usage/README.md
|
||||||
- Developer Guide: contributing/README.md
|
- Developer Guide: contributing/README.md
|
||||||
@ -19,6 +17,7 @@ nav:
|
|||||||
- Releases: https://github.com/vllm-project/vllm/releases
|
- Releases: https://github.com/vllm-project/vllm/releases
|
||||||
- User Guide:
|
- User Guide:
|
||||||
- Summary: usage/README.md
|
- Summary: usage/README.md
|
||||||
|
- usage/v1_guide.md
|
||||||
- General:
|
- General:
|
||||||
- usage/*
|
- usage/*
|
||||||
- Inference and Serving:
|
- Inference and Serving:
|
||||||
|
|||||||
@ -1,4 +1,9 @@
|
|||||||
# Configuration Options
|
# Configuration Options
|
||||||
|
|
||||||
This section lists the most common options for running the vLLM engine.
|
This section lists the most common options for running vLLM.
|
||||||
For a full list, refer to the [configuration][configuration] page.
|
|
||||||
|
There are three main levels of configuration, from highest priority to lowest priority:
|
||||||
|
|
||||||
|
- [Request parameters][completions-api] and [input arguments][sampling-params]
|
||||||
|
- [Engine arguments](./engine_args.md)
|
||||||
|
- [Environment variables](./env_vars.md)
|
||||||
|
|||||||
@ -61,7 +61,7 @@ These are documented under [Inferencing and Serving -> Production Metrics](../..
|
|||||||
|
|
||||||
### Grafana Dashboard
|
### Grafana Dashboard
|
||||||
|
|
||||||
vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/getting_started/examples/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
|
vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/examples/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
|
||||||
|
|
||||||
The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:
|
The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:
|
||||||
|
|
||||||
@ -673,7 +673,7 @@ v0 has support for OpenTelemetry tracing:
|
|||||||
- [OpenTelemetry blog
|
- [OpenTelemetry blog
|
||||||
post](https://opentelemetry.io/blog/2024/llm-observability/)
|
post](https://opentelemetry.io/blog/2024/llm-observability/)
|
||||||
- [User-facing
|
- [User-facing
|
||||||
docs](https://docs.vllm.ai/en/latest/getting_started/examples/opentelemetry.html)
|
docs](https://docs.vllm.ai/en/latest/examples/opentelemetry.html)
|
||||||
- [Blog
|
- [Blog
|
||||||
post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f)
|
post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f)
|
||||||
- [IBM product
|
- [IBM product
|
||||||
|
|||||||
@ -9,7 +9,7 @@ from typing import Literal
|
|||||||
ROOT_DIR = Path(__file__).parent.parent.parent.parent
|
ROOT_DIR = Path(__file__).parent.parent.parent.parent
|
||||||
ROOT_DIR_RELATIVE = '../../../../..'
|
ROOT_DIR_RELATIVE = '../../../../..'
|
||||||
EXAMPLE_DIR = ROOT_DIR / "examples"
|
EXAMPLE_DIR = ROOT_DIR / "examples"
|
||||||
EXAMPLE_DOC_DIR = ROOT_DIR / "docs/getting_started/examples"
|
EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples"
|
||||||
print(ROOT_DIR.resolve())
|
print(ROOT_DIR.resolve())
|
||||||
print(EXAMPLE_DIR.resolve())
|
print(EXAMPLE_DIR.resolve())
|
||||||
print(EXAMPLE_DOC_DIR.resolve())
|
print(EXAMPLE_DOC_DIR.resolve())
|
||||||
|
|||||||
@ -10,7 +10,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor
|
|||||||
|
|
||||||
For more information on CoreWeave's Tensorizer, please refer to
|
For more information on CoreWeave's Tensorizer, please refer to
|
||||||
[CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well as a general usage guide to using Tensorizer with vLLM, see
|
[CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well as a general usage guide to using Tensorizer with vLLM, see
|
||||||
the [vLLM example script](https://docs.vllm.ai/en/latest/getting_started/examples/tensorize_vllm_model.html).
|
the [vLLM example script](https://docs.vllm.ai/en/latest/examples/tensorize_vllm_model.html).
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
|
Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
|
||||||
|
|||||||
@ -6,6 +6,6 @@ vLLM can be used to generate the completions for RLHF. The best way to do this i
|
|||||||
|
|
||||||
See the following basic examples to get started if you don't want to use an existing library:
|
See the following basic examples to get started if you don't want to use an existing library:
|
||||||
|
|
||||||
- [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](https://docs.vllm.ai/en/latest/getting_started/examples/rlhf.html)
|
- [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md)
|
||||||
- [Training and inference processes are colocated on the same GPUs using Ray](https://docs.vllm.ai/en/latest/getting_started/examples/rlhf_colocate.html)
|
- [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md)
|
||||||
- [Utilities for performing RLHF with vLLM](https://docs.vllm.ai/en/latest/getting_started/examples/rlhf_utils.html)
|
- [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md)
|
||||||
|
|||||||
@ -28,7 +28,7 @@ https://github.com/coreweave/tensorizer
|
|||||||
To serialize a model, install vLLM from source, then run something
|
To serialize a model, install vLLM from source, then run something
|
||||||
like this from the root level of this repository:
|
like this from the root level of this repository:
|
||||||
|
|
||||||
python examples/other/tensorize_vllm_model.py \
|
python examples/others/tensorize_vllm_model.py \
|
||||||
--model facebook/opt-125m \
|
--model facebook/opt-125m \
|
||||||
serialize \
|
serialize \
|
||||||
--serialized-directory s3://my-bucket \
|
--serialized-directory s3://my-bucket \
|
||||||
@ -48,7 +48,7 @@ providing a `--keyfile` argument.
|
|||||||
To deserialize a model, you can run something like this from the root
|
To deserialize a model, you can run something like this from the root
|
||||||
level of this repository:
|
level of this repository:
|
||||||
|
|
||||||
python examples/other/tensorize_vllm_model.py \
|
python examples/others/tensorize_vllm_model.py \
|
||||||
--model EleutherAI/gpt-j-6B \
|
--model EleutherAI/gpt-j-6B \
|
||||||
--dtype float16 \
|
--dtype float16 \
|
||||||
deserialize \
|
deserialize \
|
||||||
@ -66,11 +66,11 @@ shard's rank. Sharded models serialized with this script will be named as
|
|||||||
model-rank-%03d.tensors
|
model-rank-%03d.tensors
|
||||||
|
|
||||||
For more information on the available arguments for serializing, run
|
For more information on the available arguments for serializing, run
|
||||||
`python -m examples.other.tensorize_vllm_model serialize --help`.
|
`python -m examples.others.tensorize_vllm_model serialize --help`.
|
||||||
|
|
||||||
Or for deserializing:
|
Or for deserializing:
|
||||||
|
|
||||||
`python examples/other/tensorize_vllm_model.py deserialize --help`.
|
`python examples/others/tensorize_vllm_model.py deserialize --help`.
|
||||||
|
|
||||||
Once a model is serialized, tensorizer can be invoked with the `LLM` class
|
Once a model is serialized, tensorizer can be invoked with the `LLM` class
|
||||||
directly to load models:
|
directly to load models:
|
||||||
@ -91,7 +91,7 @@ TensorizerConfig arguments desired.
|
|||||||
In order to see all of the available arguments usable to configure
|
In order to see all of the available arguments usable to configure
|
||||||
loading with tensorizer that are given to `TensorizerConfig`, run:
|
loading with tensorizer that are given to `TensorizerConfig`, run:
|
||||||
|
|
||||||
`python examples/other/tensorize_vllm_model.py deserialize --help`
|
`python examples/others/tensorize_vllm_model.py deserialize --help`
|
||||||
|
|
||||||
under the `tensorizer options` section. These can also be used for
|
under the `tensorizer options` section. These can also be used for
|
||||||
deserialization in this example script, although `--tensorizer-uri` and
|
deserialization in this example script, although `--tensorizer-uri` and
|
||||||
@ -62,11 +62,6 @@ ignore_patterns = [
|
|||||||
[tool.ruff]
|
[tool.ruff]
|
||||||
# Allow lines to be as long as 80.
|
# Allow lines to be as long as 80.
|
||||||
line-length = 80
|
line-length = 80
|
||||||
exclude = [
|
|
||||||
# External file, leaving license intact
|
|
||||||
"examples/other/fp8/quantizer/quantize.py",
|
|
||||||
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
|
|
||||||
]
|
|
||||||
|
|
||||||
[tool.ruff.lint.per-file-ignores]
|
[tool.ruff.lint.per-file-ignores]
|
||||||
"vllm/third_party/**" = ["ALL"]
|
"vllm/third_party/**" = ["ALL"]
|
||||||
|
|||||||
@ -41,7 +41,7 @@ compressed-tensors == 0.9.4 # required for compressed-tensors
|
|||||||
depyf==0.18.0 # required for profiling and debugging with compilation config
|
depyf==0.18.0 # required for profiling and debugging with compilation config
|
||||||
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
|
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
|
||||||
watchfiles # required for http server to monitor the updates of TLS files
|
watchfiles # required for http server to monitor the updates of TLS files
|
||||||
python-json-logger # Used by logging as per examples/other/logging_configuration.md
|
python-json-logger # Used by logging as per examples/others/logging_configuration.md
|
||||||
scipy # Required for phi-4-multimodal-instruct
|
scipy # Required for phi-4-multimodal-instruct
|
||||||
ninja # Required for xgrammar, rocm, tpu, xpu
|
ninja # Required for xgrammar, rocm, tpu, xpu
|
||||||
opentelemetry-sdk>=1.26.0 # vllm.tracing
|
opentelemetry-sdk>=1.26.0 # vllm.tracing
|
||||||
|
|||||||
@ -207,7 +207,7 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
|
|||||||
try:
|
try:
|
||||||
result = subprocess.run([
|
result = subprocess.run([
|
||||||
sys.executable,
|
sys.executable,
|
||||||
f"{VLLM_PATH}/examples/other/tensorize_vllm_model.py", "--model",
|
f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model",
|
||||||
MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size",
|
MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size",
|
||||||
str(tp_size), "serialize", "--serialized-directory",
|
str(tp_size), "serialize", "--serialized-directory",
|
||||||
str(tmp_path), "--suffix", suffix
|
str(tmp_path), "--suffix", suffix
|
||||||
|
|||||||
@ -251,7 +251,7 @@ class TensorizerArgs:
|
|||||||
encryption_keyfile: File path to a binary file containing a
|
encryption_keyfile: File path to a binary file containing a
|
||||||
binary key to use for decryption. `None` (the default) means
|
binary key to use for decryption. `None` (the default) means
|
||||||
no decryption. See the example script in
|
no decryption. See the example script in
|
||||||
examples/other/tensorize_vllm_model.py.
|
examples/others/tensorize_vllm_model.py.
|
||||||
s3_access_key_id: The access key for the S3 bucket. Can also be set via
|
s3_access_key_id: The access key for the S3 bucket. Can also be set via
|
||||||
the S3_ACCESS_KEY_ID environment variable.
|
the S3_ACCESS_KEY_ID environment variable.
|
||||||
s3_secret_access_key: The secret access key for the S3 bucket. Can also
|
s3_secret_access_key: The secret access key for the S3 bucket. Can also
|
||||||
@ -469,7 +469,7 @@ def tensorizer_weights_iterator(
|
|||||||
"loading on vLLM, as tensorizer is forced to load to CPU. "
|
"loading on vLLM, as tensorizer is forced to load to CPU. "
|
||||||
"Consider deserializing a vLLM model instead for faster "
|
"Consider deserializing a vLLM model instead for faster "
|
||||||
"load times. See the "
|
"load times. See the "
|
||||||
"examples/other/tensorize_vllm_model.py example script "
|
"examples/others/tensorize_vllm_model.py example script "
|
||||||
"for serializing vLLM models.")
|
"for serializing vLLM models.")
|
||||||
|
|
||||||
deserializer_args = tensorizer_args.deserializer_params
|
deserializer_args = tensorizer_args.deserializer_params
|
||||||
|
|||||||
@ -48,7 +48,7 @@ class TensorizerLoader(BaseModelLoader):
|
|||||||
"""Load a serialized model with tensorizer to the CPU.
|
"""Load a serialized model with tensorizer to the CPU.
|
||||||
|
|
||||||
This is only necessary when the model isn't vLLM-tensorized (see
|
This is only necessary when the model isn't vLLM-tensorized (see
|
||||||
examples/other/tensorize_vllm_model.py) This should still
|
examples/others/tensorize_vllm_model.py) This should still
|
||||||
be faster than default HuggingFace loading, but will be slower than
|
be faster than default HuggingFace loading, but will be slower than
|
||||||
loading a vLLM-tensorized model.
|
loading a vLLM-tensorized model.
|
||||||
"""
|
"""
|
||||||
@ -68,7 +68,7 @@ class TensorizerLoader(BaseModelLoader):
|
|||||||
"""Load a serialized model with tensorizer.
|
"""Load a serialized model with tensorizer.
|
||||||
|
|
||||||
Expects a vLLM-tensorized model. See the
|
Expects a vLLM-tensorized model. See the
|
||||||
examples/other/tensorize_vllm_model.py example script
|
examples/others/tensorize_vllm_model.py example script
|
||||||
for serializing vLLM models."""
|
for serializing vLLM models."""
|
||||||
|
|
||||||
device_config = vllm_config.device_config
|
device_config = vllm_config.device_config
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user