mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-05 23:27:15 +08:00
[Doc]: fix typos in Python comments (#24294)
Signed-off-by: Didier Durand <durand.didier@gmail.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
This commit is contained in:
parent
35efa70297
commit
35bf193864
@ -417,7 +417,7 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
|
||||
))
|
||||
|
||||
def prepacked_type_key(prepack_type: PrepackTypeConfig):
|
||||
# For now we we can just use the first accumulator type seen since
|
||||
# For now, we can just use the first accumulator type seen since
|
||||
# the tensor core shapes/layouts don't vary based on accumulator
|
||||
# type so we can generate less code this way
|
||||
return (prepack_type.a, prepack_type.b_num_bits, prepack_type.convert)
|
||||
|
||||
@ -180,7 +180,7 @@ Inference batch size is an important parameter for the performance. Larger batch
|
||||
- Offline Inference: `256 * world_size`
|
||||
- Online Serving: `128 * world_size`
|
||||
|
||||
vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use DP, TP and PP together if there are enough CPU sockets and memory nodes.
|
||||
vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommended to use DP, TP and PP together if there are enough CPU sockets and memory nodes.
|
||||
|
||||
### Which quantization configs does vLLM CPU support?
|
||||
|
||||
|
||||
@ -42,7 +42,7 @@ def run_test(
|
||||
tensor_parallel_size: int = 1,
|
||||
vllm_embeddings: Optional[torch.Tensor] = None,
|
||||
):
|
||||
"""Modality agnostic test test executor for comparing HF/vLLM outputs."""
|
||||
"""Modality agnostic test executor for comparing HF/vLLM outputs."""
|
||||
# In the case of embeddings, vLLM takes separate input tensors
|
||||
vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@ class CustomAllreduce:
|
||||
group: the process group to work on. If None, it will use the
|
||||
default process group.
|
||||
device: the device to bind the CustomAllreduce to. If None,
|
||||
it will be bind to f"cuda:{local_rank}".
|
||||
it will be bound to f"cuda:{local_rank}".
|
||||
It is the caller's responsibility to make sure each communicator
|
||||
is bound to a unique device, and all communicators in this group
|
||||
are in the same node.
|
||||
@ -158,7 +158,7 @@ class CustomAllreduce:
|
||||
|
||||
self.disabled = False
|
||||
# Buffer memory is owned by this Python class and passed to C++.
|
||||
# Meta data composes of two parts: meta data for synchronization and a
|
||||
# Metadata is composed of two parts: metadata for synchronization and a
|
||||
# temporary buffer for storing intermediate allreduce results.
|
||||
self.meta_ptrs = self.create_shared_buffer(ops.meta_size() + max_size,
|
||||
group=group,
|
||||
|
||||
@ -35,7 +35,7 @@ class Internlm2ToolParser(ToolParser):
|
||||
self, request: ChatCompletionRequest) -> ChatCompletionRequest:
|
||||
if request.tools and request.tool_choice != 'none':
|
||||
# do not skip special tokens because internlm uses the special
|
||||
# tokens to indicated the start and end of the tool calls
|
||||
# tokens to indicate the start and end of the tool calls
|
||||
# information.
|
||||
request.skip_special_tokens = False
|
||||
return request
|
||||
@ -60,8 +60,8 @@ class Internlm2ToolParser(ToolParser):
|
||||
if '<|action_start|>' not in current_text:
|
||||
self.position = len(current_text)
|
||||
return DeltaMessage(content=delta_text)
|
||||
# if the tool call is sended, return a empty delta message
|
||||
# to make sure the finish_reason will be send correctly.
|
||||
# if the tool call is sent, return an empty delta message
|
||||
# to make sure the finish_reason will be sent correctly.
|
||||
if self.current_tool_id > 0:
|
||||
return DeltaMessage(content='')
|
||||
|
||||
|
||||
@ -1064,7 +1064,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
# vllm should use flashinfer fused allreduce. The variable should be a
|
||||
# JSON with the following format:
|
||||
# { <world size>: <max size in mb> }
|
||||
# Unspecified world sizes will fallback to
|
||||
# Unspecified world sizes will fall back to
|
||||
# { 2: 64, 4: 1, <everything else>: 0.5 }
|
||||
"VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB":
|
||||
lambda: json.loads(os.getenv(
|
||||
|
||||
@ -534,7 +534,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
|
||||
EM = sorted_token_ids.size(0)
|
||||
if A.size(0) < config["BLOCK_SIZE_M"]:
|
||||
# optimize for small batch_size.
|
||||
# We assume that top_ids of each token is unique, so
|
||||
# We assume that top_ids of each token is unique,
|
||||
# so num_valid_experts <= batch_size <= BLOCK_SIZE_M,
|
||||
# and we can skip some invalid blocks.
|
||||
EM = min(sorted_token_ids.size(0),
|
||||
|
||||
@ -710,7 +710,7 @@ def determine_expert_map(
|
||||
|
||||
# Create a tensor of size num_experts filled with -1
|
||||
expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
|
||||
# Create a expert map for the local experts
|
||||
# Create an expert map for the local experts
|
||||
start_idx = ep_rank * base_experts + min(ep_rank, remainder)
|
||||
expert_map[start_idx:start_idx + local_num_experts] = torch.arange(
|
||||
0, local_num_experts, dtype=torch.int32)
|
||||
@ -806,7 +806,7 @@ class FusedMoE(CustomOp):
|
||||
|
||||
self.global_num_experts = num_experts + num_redundant_experts
|
||||
|
||||
# we padding globally so EP buffer allocation works
|
||||
# we are padding globally so EP buffer allocation works
|
||||
if quant_config and quant_config.get_name() == "mxfp4":
|
||||
from vllm.model_executor.layers.quantization.mxfp4 import ( # noqa: E501
|
||||
should_use_flashinfer_mxfp4)
|
||||
|
||||
@ -469,7 +469,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
|
||||
)
|
||||
layer.register_parameter("w2_scales", w2_scales)
|
||||
set_weight_attrs(w2_scales, extra_weight_attrs)
|
||||
# dont shard the w2 scales when running act order
|
||||
# don't shard the w2 scales when running act order
|
||||
set_weight_attrs(w2_scales,
|
||||
{"load_full_w2": self.quant_config.desc_act})
|
||||
# up_proj scales
|
||||
@ -493,7 +493,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
|
||||
)
|
||||
layer.register_parameter("w2_qzeros", w2_qzeros)
|
||||
set_weight_attrs(w2_qzeros, extra_weight_attrs)
|
||||
# dont shard the w2 scales when running act order
|
||||
# don't shard the w2 scales when running act order
|
||||
set_weight_attrs(w2_qzeros,
|
||||
{"load_full_w2": self.quant_config.desc_act})
|
||||
w13_g_idx = torch.nn.Parameter(
|
||||
|
||||
@ -687,7 +687,7 @@ class FlashInferImpl(AttentionImpl):
|
||||
else:
|
||||
raise ValueError(f"Unsupported output dtype: {output.dtype}")
|
||||
|
||||
# TRTLLM attn kernel requires o scale to pass as a host scalar,
|
||||
# TRTLLM attn kernel requires the o scale to be passed as a host scalar,
|
||||
# store the o scale as a host scalar in warmup run with cuda graph
|
||||
# not enabled
|
||||
if layer._o_scale_float is None:
|
||||
|
||||
@ -439,7 +439,7 @@ class EngineCore:
|
||||
"""
|
||||
# Note on thread safety: no race condition.
|
||||
# `mm_receiver_cache` is reset at the end of LLMEngine init,
|
||||
# and will only accessed in the input processing thread afterwards.
|
||||
# and will only be accessed in the input processing thread afterwards.
|
||||
if self.mm_receiver_cache is not None and request.mm_features:
|
||||
request.mm_features = (
|
||||
self.mm_receiver_cache.get_and_update_features(
|
||||
|
||||
@ -2826,7 +2826,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
# Disable cudagraph capturing globally, so any unexpected cudagraph
|
||||
# capturing will be detected and raise an error after here.
|
||||
# Note: We don't put it into graph_capture context manager because
|
||||
# we may doing lazy capturing in future that still allows capturing
|
||||
# we may do lazy capturing in the future that still allows capturing
|
||||
# after here.
|
||||
set_cudagraph_capturing_enabled(False)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user