[Doc] Move examples and further reorganize user guide (#18666)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung 2025-05-26 22:38:04 +08:00 committed by GitHub
parent 9553fdb41e
commit 82e2339b06
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
27 changed files with 31 additions and 42 deletions

View File

@ -6,11 +6,6 @@
[tool.ruff] [tool.ruff]
line-length = 88 line-length = 88
exclude = [
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]
[tool.ruff.lint.per-file-ignores] [tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"] "vllm/third_party/**" = ["ALL"]

View File

@ -246,7 +246,7 @@ steps:
- python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_embedding.py --seed 0 - python3 offline_inference/vision_language_embedding.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0
- VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder.py - python3 offline_inference/encoder_decoder.py
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py - python3 offline_inference/basic/classify.py

2
.gitignore vendored
View File

@ -146,7 +146,7 @@ venv.bak/
# mkdocs documentation # mkdocs documentation
/site /site
docs/getting_started/examples docs/examples
# mypy # mypy
.mypy_cache/ .mypy_cache/

View File

@ -6,11 +6,6 @@
[tool.ruff] [tool.ruff]
line-length = 88 line-length = 88
exclude = [
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]
[tool.ruff.lint.per-file-ignores] [tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"] "vllm/third_party/**" = ["ALL"]

View File

@ -5,11 +5,9 @@ nav:
- getting_started/quickstart.md - getting_started/quickstart.md
- getting_started/installation - getting_started/installation
- Examples: - Examples:
- Offline Inference: getting_started/examples/offline_inference - Offline Inference: examples/offline_inference
- Online Serving: getting_started/examples/online_serving - Online Serving: examples/online_serving
- Others: - Others: examples/others
- LMCache: getting_started/examples/lmcache
- getting_started/examples/other/*
- Quick Links: - Quick Links:
- User Guide: usage/README.md - User Guide: usage/README.md
- Developer Guide: contributing/README.md - Developer Guide: contributing/README.md
@ -19,6 +17,7 @@ nav:
- Releases: https://github.com/vllm-project/vllm/releases - Releases: https://github.com/vllm-project/vllm/releases
- User Guide: - User Guide:
- Summary: usage/README.md - Summary: usage/README.md
- usage/v1_guide.md
- General: - General:
- usage/* - usage/*
- Inference and Serving: - Inference and Serving:

View File

@ -1,4 +1,9 @@
# Configuration Options # Configuration Options
This section lists the most common options for running the vLLM engine. This section lists the most common options for running vLLM.
For a full list, refer to the [configuration][configuration] page.
There are three main levels of configuration, from highest priority to lowest priority:
- [Request parameters][completions-api] and [input arguments][sampling-params]
- [Engine arguments](./engine_args.md)
- [Environment variables](./env_vars.md)

View File

@ -61,7 +61,7 @@ These are documented under [Inferencing and Serving -> Production Metrics](../..
### Grafana Dashboard ### Grafana Dashboard
vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/getting_started/examples/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard. vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/examples/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important: The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:
@ -673,7 +673,7 @@ v0 has support for OpenTelemetry tracing:
- [OpenTelemetry blog - [OpenTelemetry blog
post](https://opentelemetry.io/blog/2024/llm-observability/) post](https://opentelemetry.io/blog/2024/llm-observability/)
- [User-facing - [User-facing
docs](https://docs.vllm.ai/en/latest/getting_started/examples/opentelemetry.html) docs](https://docs.vllm.ai/en/latest/examples/opentelemetry.html)
- [Blog - [Blog
post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f) post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f)
- [IBM product - [IBM product

View File

@ -9,7 +9,7 @@ from typing import Literal
ROOT_DIR = Path(__file__).parent.parent.parent.parent ROOT_DIR = Path(__file__).parent.parent.parent.parent
ROOT_DIR_RELATIVE = '../../../../..' ROOT_DIR_RELATIVE = '../../../../..'
EXAMPLE_DIR = ROOT_DIR / "examples" EXAMPLE_DIR = ROOT_DIR / "examples"
EXAMPLE_DOC_DIR = ROOT_DIR / "docs/getting_started/examples" EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples"
print(ROOT_DIR.resolve()) print(ROOT_DIR.resolve())
print(EXAMPLE_DIR.resolve()) print(EXAMPLE_DIR.resolve())
print(EXAMPLE_DOC_DIR.resolve()) print(EXAMPLE_DOC_DIR.resolve())

View File

@ -10,7 +10,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor
For more information on CoreWeave's Tensorizer, please refer to For more information on CoreWeave's Tensorizer, please refer to
[CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well as a general usage guide to using Tensorizer with vLLM, see [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well as a general usage guide to using Tensorizer with vLLM, see
the [vLLM example script](https://docs.vllm.ai/en/latest/getting_started/examples/tensorize_vllm_model.html). the [vLLM example script](https://docs.vllm.ai/en/latest/examples/tensorize_vllm_model.html).
!!! note !!! note
Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.

View File

@ -6,6 +6,6 @@ vLLM can be used to generate the completions for RLHF. The best way to do this i
See the following basic examples to get started if you don't want to use an existing library: See the following basic examples to get started if you don't want to use an existing library:
- [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](https://docs.vllm.ai/en/latest/getting_started/examples/rlhf.html) - [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md)
- [Training and inference processes are colocated on the same GPUs using Ray](https://docs.vllm.ai/en/latest/getting_started/examples/rlhf_colocate.html) - [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md)
- [Utilities for performing RLHF with vLLM](https://docs.vllm.ai/en/latest/getting_started/examples/rlhf_utils.html) - [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md)

View File

@ -28,7 +28,7 @@ https://github.com/coreweave/tensorizer
To serialize a model, install vLLM from source, then run something To serialize a model, install vLLM from source, then run something
like this from the root level of this repository: like this from the root level of this repository:
python examples/other/tensorize_vllm_model.py \ python examples/others/tensorize_vllm_model.py \
--model facebook/opt-125m \ --model facebook/opt-125m \
serialize \ serialize \
--serialized-directory s3://my-bucket \ --serialized-directory s3://my-bucket \
@ -48,7 +48,7 @@ providing a `--keyfile` argument.
To deserialize a model, you can run something like this from the root To deserialize a model, you can run something like this from the root
level of this repository: level of this repository:
python examples/other/tensorize_vllm_model.py \ python examples/others/tensorize_vllm_model.py \
--model EleutherAI/gpt-j-6B \ --model EleutherAI/gpt-j-6B \
--dtype float16 \ --dtype float16 \
deserialize \ deserialize \
@ -66,11 +66,11 @@ shard's rank. Sharded models serialized with this script will be named as
model-rank-%03d.tensors model-rank-%03d.tensors
For more information on the available arguments for serializing, run For more information on the available arguments for serializing, run
`python -m examples.other.tensorize_vllm_model serialize --help`. `python -m examples.others.tensorize_vllm_model serialize --help`.
Or for deserializing: Or for deserializing:
`python examples/other/tensorize_vllm_model.py deserialize --help`. `python examples/others/tensorize_vllm_model.py deserialize --help`.
Once a model is serialized, tensorizer can be invoked with the `LLM` class Once a model is serialized, tensorizer can be invoked with the `LLM` class
directly to load models: directly to load models:
@ -91,7 +91,7 @@ TensorizerConfig arguments desired.
In order to see all of the available arguments usable to configure In order to see all of the available arguments usable to configure
loading with tensorizer that are given to `TensorizerConfig`, run: loading with tensorizer that are given to `TensorizerConfig`, run:
`python examples/other/tensorize_vllm_model.py deserialize --help` `python examples/others/tensorize_vllm_model.py deserialize --help`
under the `tensorizer options` section. These can also be used for under the `tensorizer options` section. These can also be used for
deserialization in this example script, although `--tensorizer-uri` and deserialization in this example script, although `--tensorizer-uri` and

View File

@ -62,11 +62,6 @@ ignore_patterns = [
[tool.ruff] [tool.ruff]
# Allow lines to be as long as 80. # Allow lines to be as long as 80.
line-length = 80 line-length = 80
exclude = [
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]
[tool.ruff.lint.per-file-ignores] [tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"] "vllm/third_party/**" = ["ALL"]

View File

@ -41,7 +41,7 @@ compressed-tensors == 0.9.4 # required for compressed-tensors
depyf==0.18.0 # required for profiling and debugging with compilation config depyf==0.18.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files watchfiles # required for http server to monitor the updates of TLS files
python-json-logger # Used by logging as per examples/other/logging_configuration.md python-json-logger # Used by logging as per examples/others/logging_configuration.md
scipy # Required for phi-4-multimodal-instruct scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu ninja # Required for xgrammar, rocm, tpu, xpu
opentelemetry-sdk>=1.26.0 # vllm.tracing opentelemetry-sdk>=1.26.0 # vllm.tracing

View File

@ -207,7 +207,7 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
try: try:
result = subprocess.run([ result = subprocess.run([
sys.executable, sys.executable,
f"{VLLM_PATH}/examples/other/tensorize_vllm_model.py", "--model", f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model",
MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size", MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size",
str(tp_size), "serialize", "--serialized-directory", str(tp_size), "serialize", "--serialized-directory",
str(tmp_path), "--suffix", suffix str(tmp_path), "--suffix", suffix

View File

@ -251,7 +251,7 @@ class TensorizerArgs:
encryption_keyfile: File path to a binary file containing a encryption_keyfile: File path to a binary file containing a
binary key to use for decryption. `None` (the default) means binary key to use for decryption. `None` (the default) means
no decryption. See the example script in no decryption. See the example script in
examples/other/tensorize_vllm_model.py. examples/others/tensorize_vllm_model.py.
s3_access_key_id: The access key for the S3 bucket. Can also be set via s3_access_key_id: The access key for the S3 bucket. Can also be set via
the S3_ACCESS_KEY_ID environment variable. the S3_ACCESS_KEY_ID environment variable.
s3_secret_access_key: The secret access key for the S3 bucket. Can also s3_secret_access_key: The secret access key for the S3 bucket. Can also
@ -469,7 +469,7 @@ def tensorizer_weights_iterator(
"loading on vLLM, as tensorizer is forced to load to CPU. " "loading on vLLM, as tensorizer is forced to load to CPU. "
"Consider deserializing a vLLM model instead for faster " "Consider deserializing a vLLM model instead for faster "
"load times. See the " "load times. See the "
"examples/other/tensorize_vllm_model.py example script " "examples/others/tensorize_vllm_model.py example script "
"for serializing vLLM models.") "for serializing vLLM models.")
deserializer_args = tensorizer_args.deserializer_params deserializer_args = tensorizer_args.deserializer_params

View File

@ -48,7 +48,7 @@ class TensorizerLoader(BaseModelLoader):
"""Load a serialized model with tensorizer to the CPU. """Load a serialized model with tensorizer to the CPU.
This is only necessary when the model isn't vLLM-tensorized (see This is only necessary when the model isn't vLLM-tensorized (see
examples/other/tensorize_vllm_model.py). This should still examples/others/tensorize_vllm_model.py). This should still
be faster than default HuggingFace loading, but will be slower than be faster than default HuggingFace loading, but will be slower than
loading a vLLM-tensorized model. loading a vLLM-tensorized model.
""" """
@ -68,7 +68,7 @@ class TensorizerLoader(BaseModelLoader):
"""Load a serialized model with tensorizer. """Load a serialized model with tensorizer.
Expects a vLLM-tensorized model. See the Expects a vLLM-tensorized model. See the
examples/other/tensorize_vllm_model.py example script examples/others/tensorize_vllm_model.py example script
for serializing vLLM models.""" for serializing vLLM models."""
device_config = vllm_config.device_config device_config = vllm_config.device_config