Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-10 03:44:56 +08:00)
[Doc]: fix typos in Python comments (#24417)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
This commit is contained in:
parent 2f0b833a05
commit f4962a6d55
@@ -143,5 +143,5 @@ outputs = llm.chat(messages, sampling_params, tools=tools)
 print(outputs[0].outputs[0].text.strip())
 # yields
-# 'The weather in Dallas, TX is 85 degrees fahrenheit. '
+# 'The weather in Dallas, TX is 85 degrees Fahrenheit. '
 # 'It is partly cloudly, with highs in the 90's.'
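For context, a minimal sketch of the tool-calling run this example output comes from; only the llm.chat(messages, sampling_params, tools=tools) call and the final print line appear in the hunk itself, while the model name, tool schema, and sampling settings below are illustrative assumptions.

from vllm import LLM, SamplingParams

# Hypothetical weather tool in the OpenAI-style function format vLLM accepts.
tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {"type": "string"},
                "state": {"type": "string"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            "required": ["city", "state", "unit"],
        },
    },
}]

messages = [{"role": "user", "content": "What is the weather in Dallas, TX?"}]

llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")  # assumed model
sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
outputs = llm.chat(messages, sampling_params, tools=tools)
print(outputs[0].outputs[0].text.strip())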
@@ -1052,7 +1052,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
 return layer.weight

 # we currently do not have quantized bmm's which are needed for
-# `W_UV` and `W_UK_T`, we we just store fp16/bf16 copies and perform
+# `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform
 # the bmm's in 16-bit, the extra memory overhead of this is fairly low
 kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
 assert kv_b_proj_weight.shape == (
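The comment above concerns keeping 16-bit copies of `W_UV` and `W_UK_T` because no quantized bmm kernel is available. Below is a simplified, hypothetical illustration of that idea; the helper names and shapes are made up and this is not vLLM's MLA code.

import torch

def store_16bit_copy(dequantized_weight: torch.Tensor,
                     dtype: torch.dtype = torch.bfloat16) -> torch.Tensor:
    # Keep a plain fp16/bf16 copy of the (possibly dequantized) weight so the
    # batched matmul can run in 16-bit even when the layer itself is quantized.
    return dequantized_weight.to(dtype).contiguous()

def bmm_16bit(x: torch.Tensor, w_16bit: torch.Tensor) -> torch.Tensor:
    # x: (num_heads, tokens, k) and w_16bit: (num_heads, k, n) -- toy shapes.
    return torch.bmm(x.to(w_16bit.dtype), w_16bit)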
@@ -1169,7 +1169,7 @@ class ModelConfig:
 ]
 # Any custom overrides will be in quantization_methods so we place
 # them at the start of the list so custom overrides have preference
-# over the built in ones.
+# over the built-in ones.
 quantization_methods = quantization_methods + overrides

 # Detect which checkpoint is it
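The point of the comment above is that precedence comes from list order during checkpoint detection. The toy loop below is not vLLM's actual detection code; the method names and the "hint" check are invented stand-ins, and it only shows the first-match-wins behavior that makes ordering matter.

quantization_methods = ["my_custom_gptq", "awq", "gptq", "fp8"]

def detect(checkpoint_hints: set) -> str:
    # The first matching entry wins, so whatever sits earlier in the list is
    # preferred over entries that come later.
    for name in quantization_methods:
        if name in checkpoint_hints:
            return name
    return "none"

print(detect({"gptq", "my_custom_gptq"}))  # -> 'my_custom_gptq'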
@@ -770,7 +770,7 @@ class NixlConnectorWorker:
 # with joint KV for each block. This minimizes the overhead in
 # registerMem allowing faster descs queries. In order to be able to
 # split on kv_heads dim as required by heterogeneous TP, one must
-# be able to index K/V separately. Hence the we double the number
+# be able to index K/V separately. Hence we double the number
 # of 'virtual' regions here and halve `block_len` below.
 self.num_regions *= 2

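As a rough illustration of the bookkeeping the comment describes (toy numbers, not the real NIXL registration code): splitting each joint-KV region into separate K and V regions doubles the region count and halves the per-region length.

num_blocks = 4
joint_block_len = 1024            # bytes covering K and V together (made up)

num_regions = num_blocks          # one region per block with joint KV
num_regions *= 2                  # separate 'virtual' regions for K and for V
block_len = joint_block_len // 2  # each half now covers only K or only V

assert num_regions == 8 and block_len == 512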
@@ -1159,7 +1159,7 @@ class EngineArgs:
 # Note(hc): In the current implementation of decode context
 # parallel(DCP), tp_size needs to be divisible by dcp_size,
 # because the world size does not change by dcp, it simply
-# reuse the GPUs of TP group, and split one TP group into
+# reuses the GPUs of TP group, and split one TP group into
 # tp_size//dcp_size DCP groups.
 assert self.tensor_parallel_size % self.decode_context_parallel_size \
 == 0, (
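A small numeric sketch of the constraint this assert enforces; the sizes and the contiguous rank grouping are assumptions for illustration, not necessarily how vLLM assigns ranks to DCP groups.

tensor_parallel_size = 8              # assumed tp_size
decode_context_parallel_size = 2      # assumed dcp_size

assert tensor_parallel_size % decode_context_parallel_size == 0, (
    "tp_size must be divisible by dcp_size")

# DCP reuses the TP group's GPUs: split the TP ranks into
# tp_size // dcp_size groups of dcp_size ranks each.
num_dcp_groups = tensor_parallel_size // decode_context_parallel_size
dcp_groups = [
    list(range(g * decode_context_parallel_size,
               (g + 1) * decode_context_parallel_size))
    for g in range(num_dcp_groups)
]
print(dcp_groups)  # [[0, 1], [2, 3], [4, 5], [6, 7]]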
@@ -235,7 +235,7 @@ class MQLLMEngineClient(EngineClient):
 # therefore we have to inform that the current
 # processed requests failed as well. Send back a dead
 # engine error give this feedback and also give a
-# 'hint' to the server to shutdown next.
+# 'hint' to the server to shut down next.
 exception = self.dead_error

 if request_id is None:
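A generic sketch of the error-propagation pattern described above; this is not MQLLMEngineClient's real implementation, and the class and attribute layout are assumed. Once the engine is known to be dead, every still-pending request is failed with the stored dead-engine error so callers get feedback and the server can shut down.

import asyncio

class DeadEngineDemoClient:
    def __init__(self) -> None:
        self.dead_error = RuntimeError(
            "Engine loop is dead; the server should shut down next")
        self.pending: dict[str, asyncio.Future] = {}

    def fail_all_pending(self) -> None:
        # Inform every in-flight request that it failed along with the engine.
        for fut in self.pending.values():
            if not fut.done():
                fut.set_exception(self.dead_error)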
@@ -204,7 +204,7 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
 frontend_kwargs["lora_modules"]["type"] = optional_type(str)
 frontend_kwargs["lora_modules"]["action"] = LoRAParserAction

-# Special case: Middleware needs append action
+# Special case: Middleware needs to append action
 frontend_kwargs["middleware"]["action"] = "append"
 frontend_kwargs["middleware"]["type"] = str
 if "nargs" in frontend_kwargs["middleware"]:
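The "append" action set above is standard argparse behavior; the standalone example below uses the same flag name as the hunk, but the middleware paths are made up.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--middleware", action="append", type=str, default=[])

args = parser.parse_args(
    ["--middleware", "pkg.mod.AuthMiddleware",
     "--middleware", "pkg.mod.LoggingMiddleware"])
print(args.middleware)
# ['pkg.mod.AuthMiddleware', 'pkg.mod.LoggingMiddleware']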
@@ -176,7 +176,7 @@ class Llama4PythonicToolParser(ToolParser):
 index] += delta.function.arguments

 # HACK: serving_chat.py inspects the internal state of tool parsers
-# when determining it's final streaming delta, automatically
+# when determining its final streaming delta, automatically
 # adding autocompleted JSON.
 # These two lines avoid that nonsense while ensuring finish_reason
 # is set to tool_calls when at least one tool is called.
@@ -143,7 +143,7 @@ class MistralToolParser(ToolParser):
 except json.JSONDecodeError:
 # use a regex to find the part corresponding to the tool call.
 # NOTE: This use case should not happen if the model is trained
-# correctly. It's a easy possible fix so it's included, but
+# correctly. It's an easy possible fix so it's included, but
 # can be brittle for very complex / highly nested tool calls
 raw_tool_call = self.tool_call_regex.findall(tool_content)[0]
 function_call_arr = json.loads(raw_tool_call)
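A simplified stand-in for the fallback described above: try strict JSON first and only fall back to a (brittle) regex extraction on failure. The regex here is an assumption for illustration, not MistralToolParser's actual tool_call_regex.

import json
import re

tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)  # assumed pattern

def parse_tool_calls(tool_content: str):
    try:
        return json.loads(tool_content)
    except json.JSONDecodeError:
        matches = tool_call_regex.findall(tool_content)
        if not matches:
            raise
        return json.loads(matches[0])

print(parse_tool_calls(
    'reply text [{"name": "get_weather", "arguments": {"city": "Dallas"}}]'))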
@@ -302,7 +302,7 @@ class FusedMoEPrepareAndFinalize(ABC):
 def max_num_tokens_per_rank(self) -> Optional[int]:
 """
 Some PrepareFinalize All2All implementations are batched. Meaning,
-they can processes only as set of tokens at a time. This
+they can process only as set of tokens at a time. This
 function returns the batch size i.e the maximum number of tokens
 the implementation can process at a time.
 Return None if there are no such restrictions.
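To make the docstring concrete, here is a hypothetical pair of implementations (not actual vLLM classes): a batched one that advertises its per-rank token limit and an unbatched one that returns None.

from typing import Optional

class BatchedPrepareFinalizeSketch:
    def __init__(self, max_tokens: int = 256):
        self._max_tokens = max_tokens  # assumed batch size

    def max_num_tokens_per_rank(self) -> Optional[int]:
        # Processes at most `_max_tokens` tokens at a time.
        return self._max_tokens

class UnbatchedPrepareFinalizeSketch:
    def max_num_tokens_per_rank(self) -> Optional[int]:
        # No batching restriction.
        return None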
@@ -201,7 +201,7 @@ def marlin_make_workspace(output_size_per_partition: int,
 def marlin_make_workspace_new(device: torch.device,
 max_blocks_per_sm: int = 1) -> torch.Tensor:
 # In the new marlin kernel, we use the num of threadblocks as workspace
-# size. The num of threadblocks is is sms_count * max_blocks_per_sm.
+# size. The num of threadblocks is sms_count * max_blocks_per_sm.
 sms = torch.cuda.get_device_properties(device).multi_processor_count
 return torch.zeros(sms * max_blocks_per_sm,
 dtype=torch.int,
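The sizing rule in the comment can be sanity-checked directly with torch; the snippet below is a standalone check (it needs a CUDA device, and the device index and max_blocks_per_sm value are arbitrary), not part of the kernel code.

import torch

if torch.cuda.is_available():
    device = torch.device("cuda:0")
    sms = torch.cuda.get_device_properties(device).multi_processor_count
    max_blocks_per_sm = 1
    # One int32 slot per threadblock: sms_count * max_blocks_per_sm in total.
    workspace = torch.zeros(sms * max_blocks_per_sm, dtype=torch.int, device=device)
    print(sms, workspace.numel())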
@@ -98,7 +98,7 @@ class BlockTable:
 # here because M (max_model_len) is not necessarily divisible by
 # block_size.
 if self.dcp_world_size > 1:
-# Note(hc): The DCP implement store kvcache with a interleave
+# Note(hc): The DCP implement store kvcache with an interleave
 # style, the kvcache for the token whose token_idx is i is
 # always stored on the GPU whose dcp_rank equals i % cp_world_size:

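A tiny worked example of the interleaved placement rule stated in the comment (the world size and token count are made up): the KV cache for token i lands on the rank equal to i % cp_world_size.

cp_world_size = 4  # assumed DCP world size
placement = {token_idx: token_idx % cp_world_size for token_idx in range(10)}
print(placement)
# {0: 0, 1: 1, 2: 2, 3: 3, 4: 0, 5: 1, 6: 2, 7: 3, 8: 0, 9: 1}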