mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-08 20:22:15 +08:00
[Doc]: fix typos in Python comments (#24001)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
This commit is contained in:
parent
749be00a98
commit
9701352e4b
@@ -43,7 +43,7 @@ cudagraph_capturing_enabled: bool = True
|
|||||||
|
|
||||||
|
|
||||||
def validate_cudagraph_capturing_enabled():
|
def validate_cudagraph_capturing_enabled():
|
||||||
# used to monitor whether an cudagraph capturing is legal at runtime.
|
# used to monitor whether a cudagraph capturing is legal at runtime.
|
||||||
# should be called before any cudagraph capturing.
|
# should be called before any cudagraph capturing.
|
||||||
# if an illegal cudagraph capturing happens, raise an error.
|
# if an illegal cudagraph capturing happens, raise an error.
|
||||||
global cudagraph_capturing_enabled
|
global cudagraph_capturing_enabled
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ class LRUEvictor(Evictor):
|
|||||||
that's recorded in the Block. If there are multiple blocks with
|
that's recorded in the Block. If there are multiple blocks with
|
||||||
the same last_accessed time, then the one with the largest num_hashed_tokens
|
the same last_accessed time, then the one with the largest num_hashed_tokens
|
||||||
will be evicted. If two blocks each have the lowest last_accessed time and
|
will be evicted. If two blocks each have the lowest last_accessed time and
|
||||||
highest num_hashed_tokens value, then one will be chose arbitrarily
|
highest num_hashed_tokens value, then one will be chosen arbitrarily
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# CLEANUP_THRESHOLD determines the maximum allowable size of the priority
|
# CLEANUP_THRESHOLD determines the maximum allowable size of the priority
|
||||||
|
|||||||
@@ -1239,7 +1239,7 @@ class LLMEngine:
|
|||||||
|
|
||||||
# Stop the execute model loop in parallel workers until there are
|
# Stop the execute model loop in parallel workers until there are
|
||||||
# more requests to process. This avoids waiting indefinitely in
|
# more requests to process. This avoids waiting indefinitely in
|
||||||
# torch.distributed ops which may otherwise timeout, and unblocks
|
# torch.distributed ops which may otherwise time out, and unblocks
|
||||||
# the RPC thread in the workers so that they can process any other
|
# the RPC thread in the workers so that they can process any other
|
||||||
# queued control plane messages, such as add/remove lora adapters.
|
# queued control plane messages, such as add/remove lora adapters.
|
||||||
logger.debug("Stopping remote worker execution loop.")
|
logger.debug("Stopping remote worker execution loop.")
|
||||||
|
|||||||
@@ -329,7 +329,7 @@ class LLM:
|
|||||||
Args:
|
Args:
|
||||||
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
||||||
for batch inference. See [PromptType][vllm.inputs.PromptType]
|
for batch inference. See [PromptType][vllm.inputs.PromptType]
|
||||||
for more details about the format of each prompts.
|
for more details about the format of each prompt.
|
||||||
sampling_params: The sampling parameters for text generation. If
|
sampling_params: The sampling parameters for text generation. If
|
||||||
None, we use the default sampling parameters.
|
None, we use the default sampling parameters.
|
||||||
When it is a single value, it is applied to every prompt.
|
When it is a single value, it is applied to every prompt.
|
||||||
@@ -853,7 +853,7 @@ class LLM:
|
|||||||
Args:
|
Args:
|
||||||
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
||||||
for batch inference. See [PromptType][vllm.inputs.PromptType]
|
for batch inference. See [PromptType][vllm.inputs.PromptType]
|
||||||
for more details about the format of each prompts.
|
for more details about the format of each prompt.
|
||||||
pooling_params: The pooling parameters for pooling. If None, we
|
pooling_params: The pooling parameters for pooling. If None, we
|
||||||
use the default pooling parameters.
|
use the default pooling parameters.
|
||||||
use_tqdm: If `True`, shows a tqdm progress bar.
|
use_tqdm: If `True`, shows a tqdm progress bar.
|
||||||
@@ -946,7 +946,7 @@ class LLM:
|
|||||||
Args:
|
Args:
|
||||||
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
||||||
for batch inference. See [PromptType][vllm.inputs.PromptType]
|
for batch inference. See [PromptType][vllm.inputs.PromptType]
|
||||||
for more details about the format of each prompts.
|
for more details about the format of each prompt.
|
||||||
pooling_params: The pooling parameters for pooling. If None, we
|
pooling_params: The pooling parameters for pooling. If None, we
|
||||||
use the default pooling parameters.
|
use the default pooling parameters.
|
||||||
use_tqdm: If `True`, shows a tqdm progress bar.
|
use_tqdm: If `True`, shows a tqdm progress bar.
|
||||||
@@ -994,7 +994,7 @@ class LLM:
|
|||||||
Args:
|
Args:
|
||||||
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
||||||
for batch inference. See [PromptType][vllm.inputs.PromptType]
|
for batch inference. See [PromptType][vllm.inputs.PromptType]
|
||||||
for more details about the format of each prompts.
|
for more details about the format of each prompt.
|
||||||
use_tqdm: If `True`, shows a tqdm progress bar.
|
use_tqdm: If `True`, shows a tqdm progress bar.
|
||||||
If a callable (e.g., `functools.partial(tqdm, leave=False)`),
|
If a callable (e.g., `functools.partial(tqdm, leave=False)`),
|
||||||
it is used to create the progress bar.
|
it is used to create the progress bar.
|
||||||
@@ -1038,7 +1038,7 @@ class LLM:
|
|||||||
Args:
|
Args:
|
||||||
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
||||||
for batch inference. See [PromptType][vllm.inputs.PromptType]
|
for batch inference. See [PromptType][vllm.inputs.PromptType]
|
||||||
for more details about the format of each prompts.
|
for more details about the format of each prompt.
|
||||||
use_tqdm: If `True`, shows a tqdm progress bar.
|
use_tqdm: If `True`, shows a tqdm progress bar.
|
||||||
If a callable (e.g., `functools.partial(tqdm, leave=False)`),
|
If a callable (e.g., `functools.partial(tqdm, leave=False)`),
|
||||||
it is used to create the progress bar.
|
it is used to create the progress bar.
|
||||||
|
|||||||
@@ -101,7 +101,7 @@ class MultiprocessingDistributedExecutor(DistributedExecutorBase):
|
|||||||
result_handler.start()
|
result_handler.start()
|
||||||
self.worker_monitor.start()
|
self.worker_monitor.start()
|
||||||
|
|
||||||
# Set up signal handlers to shutdown the executor cleanly
|
# Set up signal handlers to shut down the executor cleanly
|
||||||
# sometimes gc does not work well
|
# sometimes gc does not work well
|
||||||
|
|
||||||
self.driver_worker = WorkerWrapperBase(self.vllm_config, 0)
|
self.driver_worker = WorkerWrapperBase(self.vllm_config, 0)
|
||||||
|
|||||||
@@ -605,7 +605,7 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
|
|||||||
|
|
||||||
class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
|
class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
|
||||||
"""ColumnParallelLinear layer that is composed of 2 sublayers (slices)
|
"""ColumnParallelLinear layer that is composed of 2 sublayers (slices)
|
||||||
packed together (eg. gate_proj + up_proj -> gate_up_proj).
|
packed together (e.g. gate_proj + up_proj -> gate_up_proj).
|
||||||
|
|
||||||
This means we have 2 LoRAs, each applied to one half of the layer.
|
This means we have 2 LoRAs, each applied to one half of the layer.
|
||||||
|
|
||||||
|
|||||||
@@ -537,7 +537,7 @@ class Platform:
|
|||||||
|
|
||||||
def get_global_graph_pool(self) -> Any:
|
def get_global_graph_pool(self) -> Any:
|
||||||
"""
|
"""
|
||||||
Return the global graph pool for the this platform.
|
Return the global graph pool for this platform.
|
||||||
"""
|
"""
|
||||||
cls = self.__class__
|
cls = self.__class__
|
||||||
if cls._global_graph_pool is None:
|
if cls._global_graph_pool is None:
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ class HunyuanA13BReasoningParser(ReasoningParser):
|
|||||||
Key Features:
|
Key Features:
|
||||||
- For non-stream output, recognizes and extracts reasoning ("think")
|
- For non-stream output, recognizes and extracts reasoning ("think")
|
||||||
and answer ("answer") sections from text using regular expressions.
|
and answer ("answer") sections from text using regular expressions.
|
||||||
- For stream process, it require a token id sequences to change the
|
- For stream process, it requires a token id sequence to change the
|
||||||
reasoning state and other state so it maintains internal state to
|
reasoning state and other state so it maintains internal state to
|
||||||
manage parsing across multiple tokens.
|
manage parsing across multiple tokens.
|
||||||
|
|
||||||
|
|||||||
@@ -2734,7 +2734,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
layer_names)
|
layer_names)
|
||||||
attn_backends = {}
|
attn_backends = {}
|
||||||
attn_backend_layers = defaultdict(list)
|
attn_backend_layers = defaultdict(list)
|
||||||
# Dedupe based on full class name; this is a bit safer than using
|
# Dedupe based on full class name; this is a bit safer than
|
||||||
# using the class itself as the key because when we create dynamic
|
# using the class itself as the key because when we create dynamic
|
||||||
# attention backend subclasses (e.g. ChunkedLocalAttention) unless
|
# attention backend subclasses (e.g. ChunkedLocalAttention) unless
|
||||||
# they are cached correctly, there will be different objects per
|
# they are cached correctly, there will be different objects per
|
||||||
|
|||||||
@@ -224,7 +224,7 @@ class Worker(WorkerBase):
|
|||||||
memory can be used for KV cache without OOMs.
|
memory can be used for KV cache without OOMs.
|
||||||
|
|
||||||
The engine will first conduct a profiling of the existing memory usage.
|
The engine will first conduct a profiling of the existing memory usage.
|
||||||
Then, it calculate the free memory that can be used for KV cache in
|
Then, it calculates the free memory that can be used for KV cache in
|
||||||
bytes.
|
bytes.
|
||||||
|
|
||||||
Tip:
|
Tip:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user