mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-16 11:57:14 +08:00
[Doc]: fixing typos to improve docs (#24480)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
This commit is contained in:
parent
1823a00d67
commit
46876dff32
@ -169,7 +169,7 @@ All Llama 3.1, 3.2 and 4 models should be supported.
|
|||||||
|
|
||||||
The tool calling that is supported is the [JSON-based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below. As for Llama 4 models, it is recommended to use the `llama4_pythonic` tool parser.
|
The tool calling that is supported is the [JSON-based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below. As for Llama 4 models, it is recommended to use the `llama4_pythonic` tool parser.
|
||||||
|
|
||||||
Other tool calling formats like the built in python tool calling or custom tool calling are not supported.
|
Other tool calling formats like the built-in python tool calling or custom tool calling are not supported.
|
||||||
|
|
||||||
Known issues:
|
Known issues:
|
||||||
|
|
||||||
|
|||||||
@ -119,7 +119,7 @@ Currently, there are no pre-built ROCm wheels.
|
|||||||
This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
|
This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
|
||||||
|
|
||||||
!!! tip
|
!!! tip
|
||||||
- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
|
- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm-up step before collecting perf numbers.
|
||||||
- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
|
- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
|
||||||
- To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
|
- To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
|
||||||
- The ROCm version of PyTorch, ideally, should match the ROCm driver version.
|
- The ROCm version of PyTorch, ideally, should match the ROCm driver version.
|
||||||
|
|||||||
@ -9,7 +9,7 @@
|
|||||||
<|system|>
|
<|system|>
|
||||||
{{ system_message }}
|
{{ system_message }}
|
||||||
{%- if tools %}
|
{%- if tools %}
|
||||||
In addition to plain text responses, you can chose to call one or more of the provided functions.
|
In addition to plain text responses, you can choose to call one or more of the provided functions.
|
||||||
|
|
||||||
Use the following rule to decide when to call a function:
|
Use the following rule to decide when to call a function:
|
||||||
* if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so
|
* if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so
|
||||||
@ -19,7 +19,7 @@ If you decide to call functions:
|
|||||||
* prefix function calls with functools marker (no closing marker required)
|
* prefix function calls with functools marker (no closing marker required)
|
||||||
* all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
|
* all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
|
||||||
* follow the provided JSON schema. Do not hallucinate arguments or values. Do not blindly copy values from the provided samples
|
* follow the provided JSON schema. Do not hallucinate arguments or values. Do not blindly copy values from the provided samples
|
||||||
* respect the argument type formatting. E.g., if the type if number and format is float, write value 7 as 7.0
|
* respect the argument type formatting. E.g., if the type is number and format is float, write value 7 as 7.0
|
||||||
* make sure you pick the right functions that match the user intent
|
* make sure you pick the right functions that match the user intent
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -25,7 +25,7 @@ class CustomUniExecutor(UniProcExecutor):
|
|||||||
timeout: Optional[float] = None,
|
timeout: Optional[float] = None,
|
||||||
args: tuple = (),
|
args: tuple = (),
|
||||||
kwargs: Optional[dict] = None) -> list[Any]:
|
kwargs: Optional[dict] = None) -> list[Any]:
|
||||||
# Drop marker to show that this was ran
|
# Drop marker to show that this was run
|
||||||
with open(".marker", "w"):
|
with open(".marker", "w"):
|
||||||
...
|
...
|
||||||
return super().collective_rpc(method, timeout, args, kwargs)
|
return super().collective_rpc(method, timeout, args, kwargs)
|
||||||
|
|||||||
@ -79,7 +79,7 @@ def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Need to re-import huggingface_hub
|
# Need to re-import huggingface_hub
|
||||||
# and friends to setup offline mode
|
# and friends to set up offline mode
|
||||||
_re_import_modules()
|
_re_import_modules()
|
||||||
# Cached model files should be used in offline mode
|
# Cached model files should be used in offline mode
|
||||||
for model_config in MODEL_CONFIGS:
|
for model_config in MODEL_CONFIGS:
|
||||||
@ -136,7 +136,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
|
|||||||
disable_connect,
|
disable_connect,
|
||||||
)
|
)
|
||||||
# Need to re-import huggingface_hub
|
# Need to re-import huggingface_hub
|
||||||
# and friends to setup offline mode
|
# and friends to set up offline mode
|
||||||
_re_import_modules()
|
_re_import_modules()
|
||||||
engine_args = EngineArgs(model="facebook/opt-125m")
|
engine_args = EngineArgs(model="facebook/opt-125m")
|
||||||
LLM(**dataclasses.asdict(engine_args))
|
LLM(**dataclasses.asdict(engine_args))
|
||||||
|
|||||||
@ -1247,7 +1247,7 @@ def baseline_scaled_mm(a: torch.Tensor,
|
|||||||
# then we would expand a to:
|
# then we would expand a to:
|
||||||
# a = [[1, 1, 2, 2],
|
# a = [[1, 1, 2, 2],
|
||||||
# [3, 3, 4, 4]]
|
# [3, 3, 4, 4]]
|
||||||
# NOTE this function this function does not explicitly broadcast dimensions
|
# NOTE this function does not explicitly broadcast dimensions
|
||||||
# with an extent of 1, since this can be done implicitly by pytorch
|
# with an extent of 1, since this can be done implicitly by pytorch
|
||||||
def group_broadcast(t, shape):
|
def group_broadcast(t, shape):
|
||||||
for i, s in enumerate(shape):
|
for i, s in enumerate(shape):
|
||||||
|
|||||||
@ -301,7 +301,7 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
|
|||||||
finished_requests_ids is larger than the maximum mamba block capacity.
|
finished_requests_ids is larger than the maximum mamba block capacity.
|
||||||
|
|
||||||
This could generally happen due to the fact that hybrid does support
|
This could generally happen due to the fact that hybrid does support
|
||||||
a statelessness mechanism where it can cleanup new incoming requests in
|
a statelessness mechanism where it can clean up new incoming requests in
|
||||||
a single step.
|
a single step.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
@ -322,7 +322,7 @@ def test_state_cleanup(
|
|||||||
This test is for verifying that the Hybrid state is cleaned up between
|
This test is for verifying that the Hybrid state is cleaned up between
|
||||||
steps.
|
steps.
|
||||||
|
|
||||||
If its not cleaned, an error would be expected.
|
If it's not cleaned, an error would be expected.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||||
|
|||||||
@ -28,7 +28,7 @@ ACCURACY_CONFIGS = [
|
|||||||
expected_value=0.76), # no bias
|
expected_value=0.76), # no bias
|
||||||
# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
|
# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
|
||||||
# so only one of these tests can run in a single call to pytest. As
|
# so only one of these tests can run in a single call to pytest. As
|
||||||
# a follow up, move this into the LM-EVAL section of the CI.
|
# a follow-up, move this into the LM-EVAL section of the CI.
|
||||||
# GSM8KAccuracyTestConfig(
|
# GSM8KAccuracyTestConfig(
|
||||||
# model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8",
|
# model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8",
|
||||||
# expected_value=0.66), # bias in QKV layers
|
# expected_value=0.66), # bias in QKV layers
|
||||||
|
|||||||
@ -1117,7 +1117,7 @@ def initialize_model_parallel(
|
|||||||
"decode context model parallel group is already initialized")
|
"decode context model parallel group is already initialized")
|
||||||
# Note(hc): In the current implementation of decode context parallel,
|
# Note(hc): In the current implementation of decode context parallel,
|
||||||
# dcp_size must not exceed tp_size, because the world size does not
|
# dcp_size must not exceed tp_size, because the world size does not
|
||||||
# change by DCP, it simply reuse the GPUs of TP group, and split one
|
# change by DCP, it simply reuses the GPUs of TP group, and splits one
|
||||||
# TP group into tp_size//dcp_size DCP groups.
|
# TP group into tp_size//dcp_size DCP groups.
|
||||||
group_ranks = all_ranks.reshape(
|
group_ranks = all_ranks.reshape(
|
||||||
-1, decode_context_model_parallel_size).unbind(0)
|
-1, decode_context_model_parallel_size).unbind(0)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user