mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-13 00:25:45 +08:00
[Doc]: fix typos in Python scripts (#23828)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
This commit is contained in:
parent bfab219648
commit d3da2eea54
@@ -271,7 +271,7 @@ def split_graph(graph: fx.GraphModule,
         outputs.append(
             SplitItem(name, graph_id, (graph_id in split_op_graphs), module))

-    # sort by intetger graph_id, rather than string name
+    # sort by integer graph_id, rather than string name
     outputs.sort(key=lambda x: x.graph_id)

     return split_gm, outputs
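The corrected comment points at a real pitfall: sorting the split items by their string names would order them lexicographically, while sorting by the integer graph_id keeps them in numeric order. A quick illustration with hypothetical names:

    names = ["submod_2", "submod_10", "submod_1"]
    print(sorted(names))       # lexicographic: ['submod_1', 'submod_10', 'submod_2']
    graph_ids = [2, 10, 1]
    print(sorted(graph_ids))   # numeric: [1, 2, 10]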
@@ -424,7 +424,7 @@ class VllmBackend:

         # if the model is initialized with a non-empty prefix,
         # then usually it's enough to use that prefix,
-        # e.g. launguage_model, vision_model, etc.
+        # e.g. language_model, vision_model, etc.
         # when multiple parts are initialized as independent
         # models, we need to use the model_tag to distinguish
         # them, e.g. backbone (default), eagle_head, etc.
@@ -115,7 +115,7 @@ class CacheConfig:

    In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254),
    some layers can skip tokens corresponding to prefill. This flag enables
-    attention metadata for eligible layers to be overriden with metadata
+    attention metadata for eligible layers to be overridden with metadata
    necessary for implementing this optimization in some models (e.g. Gemma3n)
    """

@@ -1053,7 +1053,7 @@ class EngineArgs:
             self.trust_remote_code, self.revision,
             self.code_revision, self.config_format)

-        # if loading a SpeculatorsConfig, load the specualtive_config
+        # if loading a SpeculatorsConfig, load the speculative_config
         # details from the config directly
         # no user input required / expected
         if isinstance(hf_config, SpeculatorsConfig):
@@ -640,7 +640,7 @@ class BaseMultiModalContentParser(ABC):
     def __init__(self) -> None:
         super().__init__()

-        # stores model placehodlers list with corresponding
+        # stores model placeholders list with corresponding
         # general MM placeholder:
         # {
         #   "<##IMAGE##>": ["<image>", "<image>", "<image>"],
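The mapping sketched in this comment ties one generic marker to the list of model-specific placeholders emitted for each multimodal item. A hedged illustration of how such a table could be consumed; the expand_placeholders helper below is an assumption for demonstration, not vLLM's API:

    def expand_placeholders(text: str, placeholder_map: dict[str, list[str]]) -> str:
        # Replace each occurrence of a general marker (e.g. "<##IMAGE##>")
        # with the next model-specific placeholder recorded for it, in order.
        for marker, replacements in placeholder_map.items():
            for replacement in replacements:
                text = text.replace(marker, replacement, 1)
        return text

    print(expand_placeholders(
        "<##IMAGE##> and <##IMAGE##>",
        {"<##IMAGE##>": ["<image>", "<image>"]},
    ))  # -> "<image> and <image>"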
@@ -1096,7 +1096,7 @@ if envs.VLLM_SERVER_DEV_MODE:
             raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value,
                                 detail="Missing 'method' in request body")
         # For security reason, only serialized string args/kwargs are passed.
-        # User-defined `method` is responsible for deseralization if needed.
+        # User-defined `method` is responsible for deserialization if needed.
         args: list[str] = body.get("args", [])
         kwargs: dict[str, str] = body.get("kwargs", {})
         timeout: Optional[float] = body.get("timeout")
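These comments spell out the calling convention for the dev-mode RPC endpoint: the request body carries a method name plus args/kwargs that are already serialized to strings, and the invoked method must deserialize them itself. A hedged client-side sketch; the URL, endpoint path, and method name are assumptions for illustration only:

    import json
    import requests

    payload = {
        "method": "my_debug_method",                     # hypothetical method name
        "args": [json.dumps({"temperature": 0.7})],      # pre-serialized string args
        "kwargs": {"options": json.dumps({"top_k": 5})},
        "timeout": 10.0,
    }
    # Endpoint path assumed for illustration; requires VLLM_SERVER_DEV_MODE.
    resp = requests.post("http://localhost:8000/collective_rpc", json=payload)
    print(resp.status_code, resp.text)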
@@ -579,7 +579,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         format = scheme_dict.get("format")

         # Find the sparsity scheme of the layer
-        # assume that fused layers inerhit first component's sparsity scheme
+        # assume that fused layers inherit first component's sparsity scheme
         sparsity_targets = (self.sparsity_scheme_map.keys() -
                             set(self.sparsity_ignore_list))
         sparsity_scheme: Optional[SparsityCompressionConfig] = None
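The fixed comment encodes a simple rule: a fused module has no sparsity entry of its own, so it takes the scheme of its first constituent (for example a fused qkv projection inheriting from q_proj). A rough, self-contained sketch of that rule; the helper, names, and component map below are illustrative only:

    def sparsity_scheme_for(layer_name, fused_components, scheme_map):
        # fused_components: fused layer -> ordered original submodules,
        # e.g. {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
        # scheme_map: original submodule -> sparsity scheme
        components = fused_components.get(layer_name, [layer_name])
        return scheme_map.get(components[0])  # inherit the first component's scheme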
@@ -71,7 +71,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
     ) -> "CompressedTensorsMoEMethod":
         # TODO: @dsikka: refactor this to use schemes as other kernels
         # are supported + check if the layer is being ignored.
-        # Check if a using "Linear" to select scheems
+        # Check if a using "Linear" to select schemes
         if "Linear" in quant_config.target_scheme_map:
             matched_target = "Linear"
         else:
@@ -11,7 +11,7 @@ logger = init_logger(__name__)

 class CudagraphDispatcher:
     """
-    Runtime cudagraph dispatcher to dispach keys for multiple set of cudagraphs.
+    Runtime cudagraph dispatcher to dispatch keys for multiple set of cudagraphs.

     The dispatcher stores two sets of dispatch keys, one for PIECEWISE and one
     for FULL cudagraph runtime mode. The keys are initialized depending on
@@ -21,7 +21,7 @@ class CudagraphDispatcher:

     At runtime, the dispatch method generates the runtime cudagraph mode (FULL,
     PIECEWISE, or NONE for no cudagraph) and the valid key (batch descriptor)
-    based on the input key. After dispatching (commuicate via forward context),
+    based on the input key. After dispatching (communicate via forward context),
     the cudagraph wrappers will trust the dispatch key to do either capturing
     or replaying (if mode matched), or pass through to the underlying runnable
     without cudagraph (if mode no match or mode is NONE).
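The docstring describes a two-table dispatch: one set of valid keys per cudagraph mode, and a dispatch step that maps an incoming batch descriptor to a (mode, key) pair, falling back to no cudagraph when nothing matches. A stripped-down sketch of that idea; the class and names below are illustrative stand-ins, not the vLLM implementation:

    from enum import Enum

    class Mode(Enum):
        NONE = 0
        PIECEWISE = 1
        FULL = 2

    class MiniDispatcher:
        def __init__(self):
            # one set of valid dispatch keys (batch descriptors) per mode
            self.keys = {Mode.PIECEWISE: set(), Mode.FULL: set()}

        def add_key(self, mode, descriptor):
            self.keys[mode].add(descriptor)

        def dispatch(self, descriptor):
            # prefer FULL cudagraphs, then PIECEWISE, else run without cudagraph
            for mode in (Mode.FULL, Mode.PIECEWISE):
                if descriptor in self.keys[mode]:
                    return mode, descriptor
            return Mode.NONE, None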
@@ -110,7 +110,7 @@ class BlockTable:
         self.block_table_cpu.fill_(0)

     def get_device_tensor(self) -> torch.Tensor:
-        """Ruturns the device tensor of the block table."""
+        """Returns the device tensor of the block table."""
         return self.block_table

     def get_cpu_tensor(self) -> torch.Tensor:
@@ -43,7 +43,7 @@ class CPUModelRunner(GPUModelRunner):
         Args:
             scheduler_output: The scheduler output.
         """
-        # Attention free models have zero kv_cache_goups, however models
+        # Attention free models have zero kv_cache_groups, however models
         # like Mamba are also attention free but use the kv_cache for
         # keeping its internal state. This is why we check the number
         # of kv_cache groups instead of solely checking
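The comment justifies checking the number of KV-cache groups rather than an "attention free" flag alone: a truly attention-free, stateless model has zero groups, while a Mamba-style model is also attention-free yet still keeps state in its kv_cache. A minimal sketch of that guard, with an assumed config shape:

    def needs_kv_cache_setup(kv_cache_groups: list) -> bool:
        # Zero groups: nothing to allocate at all.
        # Non-empty groups (e.g. Mamba state stored as kv_cache): still set up.
        return len(kv_cache_groups) > 0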