[Doc]: fix typos in Python comments (#24173)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
parent e41a0fa377
commit 83609ca91d
@@ -403,7 +403,7 @@ class RandomDataset(BenchmarkDataset):
 # [6880, 6881] -> ['Ġcalls', 'here'] ->
 # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
 # To avoid uncontrolled change of the prompt length,
-# the encoded sequence is truncated before being decode again.
+# the encoded sequence is truncated before being decoded again.
 total_input_len = prefix_len + int(input_lens[i])
 re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
     :total_input_len
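The fixed comment describes a re-tokenization subtlety: decoding token IDs and re-encoding the resulting text can merge or split tokens (as the ['Ġcalls', 'here'] vs ['Ġcall', 'sh', 'ere'] example shows), so the benchmark clips the re-encoded sequence back to the target length before decoding again. A minimal sketch of that pattern, assuming a Hugging Face tokenizer (illustrative, not the vLLM benchmark code):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
target_len = 8

# Decode a synthetic token-ID prompt, then re-encode it; the round trip may
# change the number of tokens, so clip before decoding again.
ids = tokenizer.encode("calls here " * 8, add_special_tokens=False)[:target_len]
prompt = tokenizer.decode(ids)
re_encoded = tokenizer.encode(prompt, add_special_tokens=False)[:target_len]
prompt = tokenizer.decode(re_encoded)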
@@ -637,7 +637,7 @@ def bench_optype(
 # Clear LoRA optimization hash-maps.
 _LORA_A_PTR_DICT.clear()
 _LORA_B_PTR_DICT.clear()
-# Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup
+# Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
 for kwargs in kwargs_list:
     op_type.bench_fn()(**kwargs)
 torch.cuda.synchronize()
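The comments in this hunk describe a common CUDA benchmarking setup: clear lazily built lookup tables, run the op once so they are repopulated, and synchronize before timing. A generic sketch of that warm-up pattern (the fn/kwargs_list/caches arguments are hypothetical, not the vLLM LoRA benchmark itself):

import torch

def warmup(fn, kwargs_list, caches):
    # Clear any lazily built caches so the timed runs start from a known state.
    for cache in caches:
        cache.clear()
    # Run once so the caches are set up again before measurement.
    for kwargs in kwargs_list:
        fn(**kwargs)
    # Wait for all queued CUDA work to finish before timing starts.
    torch.cuda.synchronize()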
@@ -962,7 +962,7 @@ async def main_mp(

 # At this point all the clients finished,
 # collect results (TTFT, TPOT, etc.) from all the clients.
-# This needs to happens before calling join on the clients
+# This needs to happen before calling join on the clients
 # (result_queue should be emptied).
 while not result_queue.empty():
     client_metrics.append(result_queue.get())
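The corrected comment reflects a real multiprocessing constraint: a child process that has put items on a multiprocessing.Queue may not terminate until those items are consumed, so the queue must be drained before join(). A small sketch of that order of operations (illustrative names):

import multiprocessing as mp

def collect_metrics(result_queue, clients):
    metrics = []
    # Drain the queue first (result_queue should be emptied) ...
    while not result_queue.empty():
        metrics.append(result_queue.get())
    # ... and only then join the client processes.
    for client in clients:
        client.join()
    return metrics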
@@ -117,7 +117,7 @@ def run_gemma3n(question: str, audio_count: int) -> ModelRequestData:

 # Granite Speech
 def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
-    # NOTE - the setting in this example are somehat different than what is
+    # NOTE - the setting in this example are somewhat different from what is
     # optimal for granite speech, and it is generally recommended to use beam
     # search. Check the model README for suggested settings.
     # https://huggingface.co/ibm-granite/granite-speech-3.3-8b
@@ -250,7 +250,7 @@ def build_video_inputs_from_test_info(

 def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
                              size_type: SizeType):
-    """Applies a size scaler to one image; this can be a an image size factor,
+    """Applies a size scaler to one image; this can be an image size factor,
     which scales the image while maintaining the aspect ratio"""
     # Special case for embeddings; if it's a tensor, it's only valid if we
     # are considering size factors at constant scale, i.e., we just clone
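The docstring being fixed concerns scaling an image by a size factor while keeping its aspect ratio. A tiny sketch of that operation with PIL (illustrative; the test helper also handles fixed target sizes and tensor embeddings, which are omitted here):

from PIL import Image

def scale_by_factor(image: Image.Image, factor: float) -> Image.Image:
    width, height = image.size
    # Scaling both dimensions by the same factor preserves the aspect ratio.
    return image.resize((int(width * factor), int(height * factor)))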
@@ -42,7 +42,7 @@ def get_filtered_test_settings(
 else:
     assert test_info.prompt_formatter is not None

-# Everything looks okay; keep if this is has correct proc handling
+# Everything looks okay; keep if this is correct proc handling
 if (test_info.distributed_executor_backend
         is not None) == new_proc_per_test:
     matching_tests[test_name] = test_info
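The condition in this hunk keeps a test only when its need for a dedicated process matches the collection mode: setting distributed_executor_backend implies a new process per test. A compact sketch of that filter (illustrative, not the actual helper):

def filter_by_proc_handling(tests: dict, new_proc_per_test: bool) -> dict:
    return {
        name: info
        for name, info in tests.items()
        # Keep the test if "needs its own process" matches the requested mode.
        if (info.distributed_executor_backend is not None) == new_proc_per_test
    }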
@@ -822,7 +822,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]):
     and context_lens_tensor is not None \
     and context_lens_tensor[:self.num_prefills].max() > 0:

-    # NOTE: it is recommend you read the `Chunked Prefill` section in
+    # NOTE: it is recommended you read the `Chunked Prefill` section in
     # the comment at the top of the file before trying to understand
     # the following code

@@ -717,7 +717,7 @@ class AsyncLLMEngine(EngineClient):
 # Stop the execute model loop in parallel workers until there
 # are more requests to process. This avoids waiting
 # indefinitely in torch.distributed ops which may otherwise
-# timeout, and unblocks the RPC thread in the workers so that
+# time out, and unblocks the RPC thread in the workers so that
 # they can process any other queued control plane messages,
 # such as add/remove lora adapters.
 await engine.engine.stop_remote_worker_execution_loop_async()
@@ -270,7 +270,7 @@ class MQLLMEngineClient(EngineClient):
         queue.put_nowait(request_output)

 async def setup(self):
-    """Setup the client before it starts sending server requests."""
+    """Set up the client before it starts sending server requests."""

     # Start output_loop
     if self.output_loop is None:
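setup() here lazily starts a background task the first time it runs. A minimal asyncio sketch of that start-once pattern (hypothetical handler name, not MQLLMEngineClient's real output loop):

import asyncio
from typing import Optional

class Client:
    def __init__(self):
        self.output_loop: Optional[asyncio.Task] = None

    async def _handle_outputs(self):
        while True:
            await asyncio.sleep(1)  # placeholder for reading server responses

    async def setup(self):
        """Set up the client before it starts sending server requests."""
        # Start the output handler only once.
        if self.output_loop is None:
            self.output_loop = asyncio.create_task(self._handle_outputs())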
@@ -19,7 +19,7 @@ def awq_dequantize_kernel(
         num_rows, # input num rows in qweight
         BLOCK_SIZE_X: tl.constexpr,
         BLOCK_SIZE_Y: tl.constexpr):
-    # Setup the pids.
+    # Set up the pids.
     pid_x = tl.program_id(axis=0)
     pid_y = tl.program_id(axis=1)

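For readers unfamiliar with the kernel context: tl.program_id returns the block index along one axis of the launch grid, and the two pids select the tile a program instance works on. A generic Triton sketch of that 2-D setup (not the AWQ dequantize kernel itself; names and the fill operation are illustrative):

import triton
import triton.language as tl

@triton.jit
def fill_kernel(out_ptr, n_rows, n_cols,
                BLOCK_SIZE_X: tl.constexpr, BLOCK_SIZE_Y: tl.constexpr):
    # Set up the pids: one program per (BLOCK_SIZE_Y, BLOCK_SIZE_X) tile.
    pid_x = tl.program_id(axis=0)
    pid_y = tl.program_id(axis=1)
    offs_x = pid_x * BLOCK_SIZE_X + tl.arange(0, BLOCK_SIZE_X)
    offs_y = pid_y * BLOCK_SIZE_Y + tl.arange(0, BLOCK_SIZE_Y)
    # Mask out-of-range elements so partial edge tiles stay in bounds.
    mask = (offs_y[:, None] < n_rows) & (offs_x[None, :] < n_cols)
    idx = offs_y[:, None] * n_cols + offs_x[None, :]
    tl.store(out_ptr + idx, idx.to(tl.float32), mask=mask)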
@@ -128,7 +128,7 @@ class QuantizationConfig(ABC):
 @staticmethod
 def get_from_keys_or(config: dict[str, Any], keys: list[str],
                      default: Any) -> Any:
-    """Get a optional value from the model's quantization config."""
+    """Get an optional value from the model's quantization config."""
     try:
         return QuantizationConfig.get_from_keys(config, keys)
     except ValueError:
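The docstring fix sits on a simple fallback helper: as the except ValueError above implies, get_from_keys raises when none of the candidate keys is present, and get_from_keys_or converts that into a default. A standalone sketch of the same lookup-with-default pattern (mirroring the shape of the code above, not a verbatim copy):

from typing import Any

def get_from_keys(config: dict[str, Any], keys: list[str]) -> Any:
    """Return the value of the first matching key, or raise ValueError."""
    for key in keys:
        if key in config:
            return config[key]
    raise ValueError(f"None of the keys {keys} found in the config")

def get_from_keys_or(config: dict[str, Any], keys: list[str], default: Any) -> Any:
    """Get an optional value from the model's quantization config."""
    try:
        return get_from_keys(config, keys)
    except ValueError:
        return default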
@@ -401,7 +401,7 @@ M = TypeVar("M", bound=MLACommonMetadata)


 def use_flashinfer_prefill() -> bool:
-    # For blackwell default to flashinfer prefill if its available since
+    # For blackwell default to flashinfer prefill if it's available since
     # it is faster than FA2.
     return (flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL
             and current_platform.is_device_capability(100))
@@ -1018,7 +1018,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
     return layer.weight

 # we currently do not have quantized bmm's which are needed for
-# `W_UV` and `W_UK_T`, we we just store fp16/bf16 copies and perform
+# `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform
 # the bmm's in 16-bit, the extra memory overhead of this is fairly low
 kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
 assert kv_b_proj_weight.shape == (
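The fixed comment explains why the MLA path keeps 16-bit copies: there is no quantized bmm, so the kv_b_proj weight is dequantized once and its two factors are stored in fp16/bf16 for use with torch.bmm. A rough sketch of splitting such a weight into per-head factors (shapes and names are illustrative, not the exact vLLM layout):

import torch

num_heads, qk_nope_dim, v_dim, kv_lora_rank = 8, 64, 64, 128

# Pretend this came from a dequantized kv_b_proj weight (transposed), so its
# shape is (kv_lora_rank, num_heads * (qk_nope_dim + v_dim)).
kv_b_proj_weight = torch.randn(kv_lora_rank, num_heads * (qk_nope_dim + v_dim))

w = kv_b_proj_weight.view(kv_lora_rank, num_heads, qk_nope_dim + v_dim)
W_UK, W_UV = w.split([qk_nope_dim, v_dim], dim=-1)

# Store bf16 copies laid out for torch.bmm: one batch entry per head.
W_UK_T = W_UK.permute(1, 2, 0).contiguous().to(torch.bfloat16)  # (heads, qk, rank)
W_UV = W_UV.permute(1, 0, 2).contiguous().to(torch.bfloat16)    # (heads, rank, v)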