mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-07 17:15:42 +08:00
[Doc]: fix typos in Python comments (#24093)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
This commit is contained in:
parent
c4ed78b14f
commit
d7e1e59972
@ -641,7 +641,7 @@ def test_schedule_decode_blocks_to_copy_update():
|
|||||||
# Nothing is preempted.
|
# Nothing is preempted.
|
||||||
assert output.blocks_to_swap_out == []
|
assert output.blocks_to_swap_out == []
|
||||||
# Since append_slot returns the source -> dist mapping, it should
|
# Since append_slot returns the source -> dist mapping, it should
|
||||||
# applied.
|
# be applied.
|
||||||
assert output.blocks_to_copy == [(2, 3)]
|
assert output.blocks_to_copy == [(2, 3)]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -32,7 +32,7 @@ def to_bytes(y, sr):
|
|||||||
|
|
||||||
async def transcribe_audio(client, tokenizer, y, sr):
|
async def transcribe_audio(client, tokenizer, y, sr):
|
||||||
# Send loaded audio directly instead of loading from disk,
|
# Send loaded audio directly instead of loading from disk,
|
||||||
# dont account for that time though
|
# don't account for that time though
|
||||||
with to_bytes(y, sr) as f:
|
with to_bytes(y, sr) as f:
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
transcription = await client.audio.transcriptions.create(
|
transcription = await client.audio.transcriptions.create(
|
||||||
|
|||||||
@ -224,7 +224,7 @@ async def test_comparison_with_prompt_logprobs_and_logprobs(server):
|
|||||||
logprobs_token_ids.append(token_id)
|
logprobs_token_ids.append(token_id)
|
||||||
|
|
||||||
# When echo=True, the logprobs include both prompt and response tokens
|
# When echo=True, the logprobs include both prompt and response tokens
|
||||||
# The token_ids field should match the the suffix of response portion
|
# The token_ids field should match the suffix of response portion
|
||||||
# The prompt_token_ids should match the prompt portion
|
# The prompt_token_ids should match the prompt portion
|
||||||
assert len(completion.choices[0].token_ids) < len(logprobs_token_ids)
|
assert len(completion.choices[0].token_ids) < len(logprobs_token_ids)
|
||||||
response_token_ids_length = len(completion.choices[0].token_ids)
|
response_token_ids_length = len(completion.choices[0].token_ids)
|
||||||
|
|||||||
@ -313,7 +313,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
|
|||||||
}],
|
}],
|
||||||
)
|
)
|
||||||
|
|
||||||
# By default cache_salt in the engine prompt is not set
|
# By default, cache_salt in the engine prompt is not set
|
||||||
with suppress(Exception):
|
with suppress(Exception):
|
||||||
await serving_chat.create_chat_completion(req)
|
await serving_chat.create_chat_completion(req)
|
||||||
assert "cache_salt" not in mock_engine.generate.call_args.args[0]
|
assert "cache_salt" not in mock_engine.generate.call_args.args[0]
|
||||||
|
|||||||
@ -1236,7 +1236,7 @@ def baseline_scaled_mm(a: torch.Tensor,
|
|||||||
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
|
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
|
||||||
|
|
||||||
# We treat N-dimensional group scaling as extended numpy-style broadcasting
|
# We treat N-dimensional group scaling as extended numpy-style broadcasting
|
||||||
# in numpy simply stretches dimensions with an extent of 1 to match the
|
# in numpy simply stretches dimensions with an extent of 1 to match
|
||||||
# the target shape by repeating the data along that dimension (broadcasting)
|
# the target shape by repeating the data along that dimension (broadcasting)
|
||||||
# , we extend these semantics to say if the extent of a dimension in the
|
# , we extend these semantics to say if the extent of a dimension in the
|
||||||
# source shape is not 1 and does not match the target shape we repeat each
|
# source shape is not 1 and does not match the target shape we repeat each
|
||||||
|
|||||||
@ -458,7 +458,7 @@ def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
|
|||||||
with torch.inference_mode():
|
with torch.inference_mode():
|
||||||
sharded_output = run_dp_sharded_vision_model(image_input, vision_model)
|
sharded_output = run_dp_sharded_vision_model(image_input, vision_model)
|
||||||
|
|
||||||
# Check that the world size is setup correctly
|
# Check that the world size is set up correctly
|
||||||
assert get_tensor_model_parallel_world_size() == world_size
|
assert get_tensor_model_parallel_world_size() == world_size
|
||||||
|
|
||||||
# Check that the outputs have the same shape
|
# Check that the outputs have the same shape
|
||||||
@ -642,7 +642,7 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
|
|||||||
rope_type="rope_3d")
|
rope_type="rope_3d")
|
||||||
sharded_output = torch.cat(sharded_output, dim=0)
|
sharded_output = torch.cat(sharded_output, dim=0)
|
||||||
|
|
||||||
# Check that the world size is setup correctly
|
# Check that the world size is set up correctly
|
||||||
assert get_tensor_model_parallel_world_size() == world_size
|
assert get_tensor_model_parallel_world_size() == world_size
|
||||||
|
|
||||||
# Compare outputs (only on rank 0)
|
# Compare outputs (only on rank 0)
|
||||||
|
|||||||
@ -83,7 +83,7 @@ def test_ngram_correctness(
|
|||||||
model_name: str,
|
model_name: str,
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
Compare the outputs of a original LLM and a speculative LLM
|
Compare the outputs of an original LLM and a speculative LLM
|
||||||
should be the same when using ngram speculative decoding.
|
should be the same when using ngram speculative decoding.
|
||||||
'''
|
'''
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
|
|||||||
@ -42,7 +42,7 @@ def test_basic_lifecycle():
|
|||||||
engine_core_outputs = scheduler.update_from_output(scheduler_output,
|
engine_core_outputs = scheduler.update_from_output(scheduler_output,
|
||||||
model_runner_output)
|
model_runner_output)
|
||||||
|
|
||||||
# Ensure the request is finished after 1 tokens.
|
# Ensure the request is finished after 1 token.
|
||||||
assert request.is_finished()
|
assert request.is_finished()
|
||||||
assert request.status == RequestStatus.FINISHED_LENGTH_CAPPED
|
assert request.status == RequestStatus.FINISHED_LENGTH_CAPPED
|
||||||
output = engine_core_outputs[0].outputs[0]
|
output = engine_core_outputs[0].outputs[0]
|
||||||
@ -141,7 +141,7 @@ def test_short_prompt_lifecycle():
|
|||||||
|
|
||||||
|
|
||||||
def test_prefix_cache_lifecycle():
|
def test_prefix_cache_lifecycle():
|
||||||
"""Test that remote decode params still works with a prefix cache hit."""
|
"""Test that remote decode params still work with a prefix cache hit."""
|
||||||
|
|
||||||
vllm_config = create_vllm_config()
|
vllm_config = create_vllm_config()
|
||||||
scheduler = create_scheduler(vllm_config)
|
scheduler = create_scheduler(vllm_config)
|
||||||
|
|||||||
@ -187,7 +187,7 @@ def test_tree_attn_correctness() -> None:
|
|||||||
dtype=torch.bfloat16,
|
dtype=torch.bfloat16,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Setup the block table and KV cache for paged KV.
|
# Set up the block table and KV cache for paged KV.
|
||||||
assert max_sequence_length % block_size == 0
|
assert max_sequence_length % block_size == 0
|
||||||
max_blocks_per_batch = max_sequence_length // block_size
|
max_blocks_per_batch = max_sequence_length // block_size
|
||||||
kv_cache = torch.randn(
|
kv_cache = torch.randn(
|
||||||
@ -222,7 +222,7 @@ def test_tree_attn_correctness() -> None:
|
|||||||
num_alloc_blocks_per_batch] = block_ids.view(
|
num_alloc_blocks_per_batch] = block_ids.view(
|
||||||
-1, num_alloc_blocks_per_batch)
|
-1, num_alloc_blocks_per_batch)
|
||||||
|
|
||||||
# Setup the slot mapping for the input KVs.
|
# Set up the slot mapping for the input KVs.
|
||||||
tree_positions = sequence_position + torch.arange(
|
tree_positions = sequence_position + torch.arange(
|
||||||
0,
|
0,
|
||||||
tree_size_q,
|
tree_size_q,
|
||||||
|
|||||||
@ -239,7 +239,7 @@ def get_adapter_absolute_path(lora_path: str) -> str:
|
|||||||
except (HfHubHTTPError, RepositoryNotFoundError, EntryNotFoundError,
|
except (HfHubHTTPError, RepositoryNotFoundError, EntryNotFoundError,
|
||||||
HFValidationError):
|
HFValidationError):
|
||||||
# Handle errors that may occur during the download
|
# Handle errors that may occur during the download
|
||||||
# Return original path instead instead of throwing error here
|
# Return original path instead of throwing error here
|
||||||
logger.exception("Error downloading the HuggingFace model")
|
logger.exception("Error downloading the HuggingFace model")
|
||||||
return lora_path
|
return lora_path
|
||||||
|
|
||||||
|
|||||||
@ -94,7 +94,7 @@ def find_matched_target(
|
|||||||
config that a layer corresponds to.
|
config that a layer corresponds to.
|
||||||
|
|
||||||
Recall that a compressed-tensors config has a concept of
|
Recall that a compressed-tensors config has a concept of
|
||||||
config_groups, where each layer can be quantized with with a different
|
config_groups, where each layer can be quantized with a different
|
||||||
scheme.
|
scheme.
|
||||||
|
|
||||||
targets in each config_group will be a list of either layer names
|
targets in each config_group will be a list of either layer names
|
||||||
|
|||||||
@ -213,7 +213,7 @@ class MediaConnector:
|
|||||||
image_mode: str = "RGB",
|
image_mode: str = "RGB",
|
||||||
) -> Image.Image:
|
) -> Image.Image:
|
||||||
"""
|
"""
|
||||||
Load a PIL image from a HTTP or base64 data URL.
|
Load a PIL image from an HTTP or base64 data URL.
|
||||||
|
|
||||||
By default, the image is converted into RGB format.
|
By default, the image is converted into RGB format.
|
||||||
"""
|
"""
|
||||||
@ -237,7 +237,7 @@ class MediaConnector:
|
|||||||
image_mode: str = "RGB",
|
image_mode: str = "RGB",
|
||||||
) -> Image.Image:
|
) -> Image.Image:
|
||||||
"""
|
"""
|
||||||
Asynchronously load a PIL image from a HTTP or base64 data URL.
|
Asynchronously load a PIL image from an HTTP or base64 data URL.
|
||||||
|
|
||||||
By default, the image is converted into RGB format.
|
By default, the image is converted into RGB format.
|
||||||
"""
|
"""
|
||||||
@ -261,7 +261,7 @@ class MediaConnector:
|
|||||||
image_mode: str = "RGB",
|
image_mode: str = "RGB",
|
||||||
) -> tuple[npt.NDArray, dict[str, Any]]:
|
) -> tuple[npt.NDArray, dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Load video from a HTTP or base64 data URL.
|
Load video from an HTTP or base64 data URL.
|
||||||
"""
|
"""
|
||||||
image_io = ImageMediaIO(image_mode=image_mode,
|
image_io = ImageMediaIO(image_mode=image_mode,
|
||||||
**self.media_io_kwargs.get("image", {}))
|
**self.media_io_kwargs.get("image", {}))
|
||||||
@ -281,7 +281,7 @@ class MediaConnector:
|
|||||||
image_mode: str = "RGB",
|
image_mode: str = "RGB",
|
||||||
) -> tuple[npt.NDArray, dict[str, Any]]:
|
) -> tuple[npt.NDArray, dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Asynchronously load video from a HTTP or base64 data URL.
|
Asynchronously load video from an HTTP or base64 data URL.
|
||||||
|
|
||||||
By default, the image is converted into RGB format.
|
By default, the image is converted into RGB format.
|
||||||
"""
|
"""
|
||||||
@ -370,7 +370,7 @@ def group_mm_inputs_by_modality(
|
|||||||
|
|
||||||
def modality_group_func(
|
def modality_group_func(
|
||||||
mm_input: MultiModalKwargsItems) -> Union[str, int]:
|
mm_input: MultiModalKwargsItems) -> Union[str, int]:
|
||||||
# If the input has multiple modalities, return a id as the unique key
|
# If the input has multiple modalities, return an id as the unique key
|
||||||
# for the mm_input input.
|
# for the mm_input input.
|
||||||
if len(mm_input) > 1:
|
if len(mm_input) > 1:
|
||||||
return id(mm_input)
|
return id(mm_input)
|
||||||
|
|||||||
@ -709,7 +709,7 @@ def reorder_batch_to_split_decodes_and_prefills(
|
|||||||
|
|
||||||
for i, req_id in enumerate(input_batch.req_ids):
|
for i, req_id in enumerate(input_batch.req_ids):
|
||||||
num_tokens = scheduler_output.num_scheduled_tokens[req_id]
|
num_tokens = scheduler_output.num_scheduled_tokens[req_id]
|
||||||
# for now treat 1 scheduled token as "decode" even if its not,
|
# for now treat 1 scheduled token as "decode" even if it's not,
|
||||||
# we should update this to something like < 8 in the future but
|
# we should update this to something like < 8 in the future but
|
||||||
# currently the TritonMLA._forward_decode only supports
|
# currently the TritonMLA._forward_decode only supports
|
||||||
# num_tokens = 1
|
# num_tokens = 1
|
||||||
|
|||||||
@ -65,9 +65,9 @@ def get_outlines_cache_path() -> str:
|
|||||||
elif xdg_cache_home:
|
elif xdg_cache_home:
|
||||||
return os.path.join(xdg_cache_home, ".cache", "outlines")
|
return os.path.join(xdg_cache_home, ".cache", "outlines")
|
||||||
# If homedir is "/", we may be inside a container, and thus writing to
|
# If homedir is "/", we may be inside a container, and thus writing to
|
||||||
# root would be problematic, so we fallback to using a tempfile.
|
# root would be problematic, so we fall back to using a tempfile.
|
||||||
# Also validate the path exists, since os.path.expanduser does
|
# Also validate the path exists, since os.path.expanduser does
|
||||||
# not garuntee existence.
|
# not guarantee existence.
|
||||||
elif os.path.isdir(home_dir) and home_dir != "/":
|
elif os.path.isdir(home_dir) and home_dir != "/":
|
||||||
# Default Unix fallback: ~/.cache/outlines
|
# Default Unix fallback: ~/.cache/outlines
|
||||||
return os.path.join(home_dir, ".cache", "outlines")
|
return os.path.join(home_dir, ".cache", "outlines")
|
||||||
|
|||||||
@ -250,7 +250,7 @@ class TPUWorker:
|
|||||||
scheduler_output: "SchedulerOutput",
|
scheduler_output: "SchedulerOutput",
|
||||||
) -> Optional[ModelRunnerOutput]:
|
) -> Optional[ModelRunnerOutput]:
|
||||||
output = self.model_runner.execute_model(scheduler_output)
|
output = self.model_runner.execute_model(scheduler_output)
|
||||||
# every worker's output is needed when kv_transfer_group is setup
|
# every worker's output is needed when kv_transfer_group is set up
|
||||||
return output if self.is_driver_worker or has_kv_transfer_group(
|
return output if self.is_driver_worker or has_kv_transfer_group(
|
||||||
) else None
|
) else None
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user