[Doc]: fix typos in Python scripts (#23828)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
commit d3da2eea54 (parent bfab219648)
@@ -271,7 +271,7 @@ def split_graph(graph: fx.GraphModule,
         outputs.append(
             SplitItem(name, graph_id, (graph_id in split_op_graphs), module))

-    # sort by intetger graph_id, rather than string name
+    # sort by integer graph_id, rather than string name
     outputs.sort(key=lambda x: x.graph_id)

     return split_gm, outputs
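The comment fixed above documents why split submodules are sorted by their integer graph_id rather than by name. A minimal standalone sketch (using a hypothetical SplitItem namedtuple, not the real class) of why string ordering breaks once there are more than nine splits:

from collections import namedtuple

# Hypothetical stand-in for vLLM's SplitItem, for illustration only.
SplitItem = namedtuple("SplitItem", ["name", "graph_id"])

items = [SplitItem(f"submod_{i}", i) for i in (0, 2, 10, 1)]

# Sorting by the string name puts "submod_10" before "submod_2".
by_name = sorted(items, key=lambda x: x.name)
# Sorting by the integer graph_id preserves the real split order.
by_id = sorted(items, key=lambda x: x.graph_id)

print([x.name for x in by_name])  # ['submod_0', 'submod_1', 'submod_10', 'submod_2']
print([x.name for x in by_id])    # ['submod_0', 'submod_1', 'submod_2', 'submod_10']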
@@ -424,7 +424,7 @@ class VllmBackend:

         # if the model is initialized with a non-empty prefix,
         # then usually it's enough to use that prefix,
-        # e.g. launguage_model, vision_model, etc.
+        # e.g. language_model, vision_model, etc.
         # when multiple parts are initialized as independent
         # models, we need to use the model_tag to distinguish
         # them, e.g. backbone (default), eagle_head, etc.
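The corrected comment describes how a compiled model part is identified: by its prefix when one is set (e.g. language_model, vision_model), otherwise by a model_tag such as backbone or eagle_head. A rough sketch of that selection rule; the helper name and default are assumptions for illustration, not vLLM's actual API:

def choose_compile_tag(prefix: str, model_tag: str = "backbone") -> str:
    """Pick an identifier for an independently compiled model part.

    A non-empty prefix (e.g. "language_model", "vision_model") is usually
    enough; otherwise the model_tag (e.g. "backbone", "eagle_head") is used
    to tell the parts apart.
    """
    return prefix if prefix else model_tag


print(choose_compile_tag("language_model"))  # language_model
print(choose_compile_tag("", "eagle_head"))  # eagle_head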
@@ -115,7 +115,7 @@ class CacheConfig:

     In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254),
     some layers can skip tokens corresponding to prefill. This flag enables
-    attention metadata for eligible layers to be overriden with metadata
+    attention metadata for eligible layers to be overridden with metadata
     necessary for implementing this optimization in some models (e.g. Gemma3n)
     """

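The docstring hunk describes a flag that lets eligible layers in KV-sharing setups such as YOCO have their attention metadata overridden so prefill tokens are skipped. A loose toy illustration of the idea; the metadata class and its fields are made up, not vLLM's attention metadata:

from dataclasses import dataclass


@dataclass
class ToyAttnMetadata:
    # Made-up, simplified fields: total tokens in the flattened batch and
    # how many of them are decode (non-prefill) tokens.
    num_tokens: int
    num_decode_tokens: int


def override_for_kv_sharing(meta: ToyAttnMetadata) -> ToyAttnMetadata:
    # Eligible layers reuse KV written by earlier layers, so they only need
    # metadata covering the decode tokens; prefill tokens are skipped.
    return ToyAttnMetadata(num_tokens=meta.num_decode_tokens,
                           num_decode_tokens=meta.num_decode_tokens)


meta = ToyAttnMetadata(num_tokens=1024, num_decode_tokens=8)
print(override_for_kv_sharing(meta))  # ToyAttnMetadata(num_tokens=8, num_decode_tokens=8)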
@@ -1053,7 +1053,7 @@ class EngineArgs:
             self.trust_remote_code, self.revision,
             self.code_revision, self.config_format)

-        # if loading a SpeculatorsConfig, load the specualtive_config
+        # if loading a SpeculatorsConfig, load the speculative_config
         # details from the config directly
         # no user input required / expected
         if isinstance(hf_config, SpeculatorsConfig):
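The corrected comment states that when the loaded HF config is a SpeculatorsConfig, the speculative settings come from the config itself rather than from user arguments. A minimal sketch of that kind of branch; the class and field names are placeholders, not vLLM's actual objects:

class ToySpeculatorsConfig:
    # Placeholder for a config that already carries speculative settings.
    def __init__(self, num_speculative_tokens: int) -> None:
        self.num_speculative_tokens = num_speculative_tokens


def resolve_speculative_config(hf_config, user_value=None):
    # When the loaded config is a speculators-style config, the speculative
    # settings come from the config itself; no user input required/expected.
    if isinstance(hf_config, ToySpeculatorsConfig):
        return {"num_speculative_tokens": hf_config.num_speculative_tokens}
    return user_value


print(resolve_speculative_config(ToySpeculatorsConfig(5)))
# {'num_speculative_tokens': 5}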
@@ -640,7 +640,7 @@ class BaseMultiModalContentParser(ABC):
     def __init__(self) -> None:
         super().__init__()

-        # stores model placehodlers list with corresponding
+        # stores model placeholders list with corresponding
         # general MM placeholder:
         # {
         #   "<##IMAGE##>": ["<image>", "<image>", "<image>"],
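The corrected comment documents the mapping the parser keeps from a generic multimodal placeholder to the model-specific placeholders that replace it. A small illustration of that shape; the placeholder strings are taken from the comment, while the surrounding code is only an example, not the parser's actual behavior:

# Generic MM placeholder -> list of model-specific placeholders, one per item.
mm_placeholder_storage: dict[str, list[str]] = {
    "<##IMAGE##>": ["<image>", "<image>", "<image>"],
}

# With three images in the request, the generic marker expands to three
# model-specific image tokens.
prompt = "Describe these: <##IMAGE##>"
for generic, specific in mm_placeholder_storage.items():
    prompt = prompt.replace(generic, "".join(specific), 1)
print(prompt)  # Describe these: <image><image><image>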
@@ -1096,7 +1096,7 @@ if envs.VLLM_SERVER_DEV_MODE:
             raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value,
                                 detail="Missing 'method' in request body")
         # For security reason, only serialized string args/kwargs are passed.
-        # User-defined `method` is responsible for deseralization if needed.
+        # User-defined `method` is responsible for deserialization if needed.
         args: list[str] = body.get("args", [])
         kwargs: dict[str, str] = body.get("kwargs", {})
         timeout: Optional[float] = body.get("timeout")
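The endpoint touched here accepts only serialized string args/kwargs and leaves deserialization to the user-supplied method. A hedged client-side sketch of building such a request body; the host, port, and route are assumptions for illustration, so check the dev-mode server for the real path:

import json
import urllib.request

# All args/kwargs must be plain strings; the target `method` on the server
# is responsible for deserializing them if it needs richer types.
body = {
    "method": "my_debug_method",                          # hypothetical method name
    "args": [json.dumps([1, 2, 3])],                      # serialized, not raw lists
    "kwargs": {"config": json.dumps({"verbose": True})},
    "timeout": 5.0,
}

req = urllib.request.Request(
    "http://localhost:8000/collective_rpc",               # assumed dev-mode route
    data=json.dumps(body).encode(),
    headers={"Content-Type": "application/json"},
)
# urllib.request.urlopen(req)  # run only against a server started in dev mode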
@@ -579,7 +579,7 @@ class CompressedTensorsConfig(QuantizationConfig):
             format = scheme_dict.get("format")

         # Find the sparsity scheme of the layer
-        # assume that fused layers inerhit first component's sparsity scheme
+        # assume that fused layers inherit first component's sparsity scheme
         sparsity_targets = (self.sparsity_scheme_map.keys() -
                             set(self.sparsity_ignore_list))
         sparsity_scheme: Optional[SparsityCompressionConfig] = None
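The corrected comment states that fused layers inherit the sparsity scheme of their first component. A small sketch of that rule with a made-up scheme map; the names and values are illustrative, not the actual CompressedTensorsConfig structures:

from typing import Optional

# Illustrative sparsity scheme map, keyed by unfused component names.
sparsity_scheme_map: dict[str, Optional[str]] = {
    "q_proj": "2:4",
    "k_proj": "2:4",
    "v_proj": "2:4",
    "gate_proj": None,
}


def scheme_for_fused(components: list[str]) -> Optional[str]:
    # A fused layer (e.g. qkv_proj built from q/k/v) takes the sparsity
    # scheme of its first component.
    return sparsity_scheme_map.get(components[0])


print(scheme_for_fused(["q_proj", "k_proj", "v_proj"]))  # 2:4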
@@ -71,7 +71,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
     ) -> "CompressedTensorsMoEMethod":
         # TODO: @dsikka: refactor this to use schemes as other kernels
         # are supported + check if the layer is being ignored.
-        # Check if a using "Linear" to select scheems
+        # Check if a using "Linear" to select schemes
         if "Linear" in quant_config.target_scheme_map:
             matched_target = "Linear"
         else:
@@ -11,7 +11,7 @@ logger = init_logger(__name__)

 class CudagraphDispatcher:
     """
-    Runtime cudagraph dispatcher to dispach keys for multiple set of cudagraphs.
+    Runtime cudagraph dispatcher to dispatch keys for multiple set of cudagraphs.

     The dispatcher stores two sets of dispatch keys, one for PIECEWISE and one
     for FULL cudagraph runtime mode. The keys are initialized depending on
@@ -21,7 +21,7 @@ class CudagraphDispatcher:

     At runtime, the dispatch method generates the runtime cudagraph mode (FULL,
     PIECEWISE, or NONE for no cudagraph) and the valid key (batch descriptor)
-    based on the input key. After dispatching (commuicate via forward context),
+    based on the input key. After dispatching (communicate via forward context),
     the cudagraph wrappers will trust the dispatch key to do either capturing
     or replaying (if mode matched), or pass through to the underlying runnable
     without cudagraph (if mode no match or mode is NONE).
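The two docstring hunks above describe the dispatcher's contract: it holds separate key sets for PIECEWISE and FULL cudagraphs and, at runtime, turns an input key into a (mode, key) pair that the cudagraph wrappers then trust. A heavily simplified, hypothetical sketch of that contract, not the real class or its dispatch policy:

from enum import Enum, auto
from typing import Optional


class Mode(Enum):
    NONE = auto()
    PIECEWISE = auto()
    FULL = auto()


class ToyDispatcher:
    """Toy stand-in: real keys are batch descriptors, and the real dispatch
    policy is more involved than this ordering."""

    def __init__(self) -> None:
        # One key set per cudagraph runtime mode, filled at capture time.
        self.keys: dict[Mode, set[int]] = {Mode.PIECEWISE: set(), Mode.FULL: set()}

    def add_key(self, mode: Mode, batch_size: int) -> None:
        self.keys[mode].add(batch_size)

    def dispatch(self, batch_size: int) -> tuple[Mode, Optional[int]]:
        # Example policy only: prefer FULL, then PIECEWISE, else no cudagraph.
        for mode in (Mode.FULL, Mode.PIECEWISE):
            if batch_size in self.keys[mode]:
                return mode, batch_size
        return Mode.NONE, None


d = ToyDispatcher()
d.add_key(Mode.FULL, 8)
d.add_key(Mode.PIECEWISE, 16)
print(d.dispatch(8))   # full-graph replay for a captured size
print(d.dispatch(32))  # (Mode.NONE, None): run without cudagraph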
@@ -110,7 +110,7 @@ class BlockTable:
         self.block_table_cpu.fill_(0)

     def get_device_tensor(self) -> torch.Tensor:
-        """Ruturns the device tensor of the block table."""
+        """Returns the device tensor of the block table."""
         return self.block_table

     def get_cpu_tensor(self) -> torch.Tensor:
@@ -43,7 +43,7 @@ class CPUModelRunner(GPUModelRunner):
         Args:
             scheduler_output: The scheduler output.
         """
-        # Attention free models have zero kv_cache_goups, however models
+        # Attention free models have zero kv_cache_groups, however models
         # like Mamba are also attention free but use the kv_cache for
         # keeping its internal state. This is why we check the number
         # of kv_cache groups instead of solely checking
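The comment fixed in the last hunk explains why the runner checks the number of kv_cache groups rather than an attention-free flag: Mamba-style models are attention free yet still use the kv_cache for their internal state. A hedged sketch of that distinction, with an illustrative helper rather than the runner's real code:

def needs_kv_cache_update(num_kv_cache_groups: int) -> bool:
    # Zero groups means there is genuinely no kv_cache state to handle.
    # Mamba-like models are attention free too, but they register kv_cache
    # groups for their internal state, so checking an "is attention free"
    # flag alone would incorrectly skip them.
    return num_kv_cache_groups > 0


print(needs_kv_cache_update(0))  # False: purely attention-free, stateless
print(needs_kv_cache_update(1))  # True: e.g. a Mamba state group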