mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-16 13:57:12 +08:00
[Doc]: fix typos in Python comments (#24077)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
This commit is contained in:
parent
56d04089ef
commit
fad73be1a5
@ -98,7 +98,7 @@ def test_api_server(api_server, distributed_executor_backend: str):
|
||||
pool.join()
|
||||
|
||||
# check cancellation stats
|
||||
# give it some times to update the stats
|
||||
# give it some time to update the stats
|
||||
time.sleep(1)
|
||||
|
||||
num_aborted_requests = requests.get(
|
||||
|
||||
@ -439,10 +439,10 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator,
|
||||
test_llm_generator):
|
||||
"""Verify block manager v2 with auto prefix caching could works normal
|
||||
"""Verify block manager v2 with auto prefix caching could work normally
|
||||
even when eviction started.
|
||||
With APC enabled, all blocks are held by native block at the beginning.
|
||||
Then blocks are managed by evictor instead. If cache hit at the evitor's
|
||||
Then blocks are managed by evictor instead. If cache hit at the evictor's
|
||||
block, then it could be reused, or we need to recompute its kv cache.
|
||||
"""
|
||||
output_len = 10
|
||||
|
||||
@ -167,7 +167,7 @@ def test_get_kwargs():
|
||||
# dict should have json tip in help
|
||||
json_tip = "Should either be a valid JSON string or JSON keys"
|
||||
assert json_tip in kwargs["json_tip"]["help"]
|
||||
# nested config should should construct the nested config
|
||||
# nested config should construct the nested config
|
||||
assert kwargs["nested_config"]["type"]('{"field": 2}') == NestedConfig(2)
|
||||
|
||||
|
||||
|
||||
@ -282,7 +282,7 @@ def triton_impl(a: torch.Tensor, topk_ids: torch.Tensor,
|
||||
a1_scale=a1_scale,
|
||||
block_shape=block_shape,
|
||||
# Make sure this is set to False so we
|
||||
# dont end up comparing the same implementation.
|
||||
# don't end up comparing the same implementation.
|
||||
allow_deep_gemm=False)
|
||||
|
||||
|
||||
|
||||
@ -59,10 +59,10 @@ async def requests_processing_time(llm,
|
||||
@pytest.mark.asyncio
|
||||
async def test_add_lora(chatglm3_lora_files):
|
||||
"""
|
||||
The add_lora function is used to pre-load some LoRA adapters into the
|
||||
The add_lora function is used to preload some LoRA adapters into the
|
||||
engine in anticipation of future requests using these adapters. To test
|
||||
this functionality, we use the async engine to process some requests - We
|
||||
do it twice, once with add_lora() pre-loading and once without.
|
||||
do it twice, once with add_lora() preloading and once without.
|
||||
|
||||
We measure the request processing time in both cases and expect the time
|
||||
to be lower in the case with add_lora() calls.
|
||||
|
||||
@ -18,7 +18,7 @@ def test_allowed_token_ids_with_lora_vocab(llama_2_7b_base_huggingface_id,
|
||||
adapters that define additional tokens.
|
||||
"""
|
||||
|
||||
# Setup a base model compatible with the sql_lora_files adapter and
|
||||
# Set up a base model compatible with the sql_lora_files adapter and
|
||||
# a known number of tokens in the base model.
|
||||
model_config = ModelConfig(
|
||||
model=llama_2_7b_base_huggingface_id,
|
||||
@ -84,7 +84,7 @@ def test_allowed_token_ids_with_lora_adapter_no_vocab(
|
||||
adapters that do not define additional tokens.
|
||||
"""
|
||||
|
||||
# Setup a base model compatible with the qwen25vl_lora_files adapter and
|
||||
# Set up a base model compatible with the qwen25vl_lora_files adapter and
|
||||
# a known number of tokens in the base model.
|
||||
model_config = ModelConfig(
|
||||
model=qwen25vl_base_huggingface_id,
|
||||
|
||||
@ -13,7 +13,7 @@ from ...registry import HF_EXAMPLE_MODELS
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
# These have unsupported head_dim for FA. We do not
|
||||
# not have a clean way to fall back, so we fail with
|
||||
# have a clean way to fall back, so we fail with
|
||||
# a clear msg when it happens.
|
||||
# https://github.com/vllm-project/vllm/issues/14524
|
||||
REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
|
||||
|
||||
@ -20,7 +20,7 @@ MISTRAL_FORMAT_MODELS = [
|
||||
"mistralai/Mistral-7B-Instruct-v0.3",
|
||||
# uses the v3-Tekken tokenizer
|
||||
"mistralai/Ministral-8B-Instruct-2410",
|
||||
# Mistral-Nemo is to big for CI, but passes locally
|
||||
# Mistral-Nemo is too big for CI, but passes locally
|
||||
# "mistralai/Mistral-Nemo-Instruct-2407"
|
||||
]
|
||||
|
||||
@ -273,7 +273,7 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
|
||||
|
||||
|
||||
def test_mistral_function_call_nested_json():
|
||||
"""Ensure that the function-name regex captures the entire outer-most
|
||||
"""Ensure that the function-name regex captures the entire outermost
|
||||
JSON block, including nested braces."""
|
||||
|
||||
# Create a minimal stub tokenizer that provides the few attributes the
|
||||
|
||||
@ -154,7 +154,7 @@ def batch_make_image_embeddings(
|
||||
embed_counter += cur_batch_embed_len
|
||||
image_counter += cur_batch_image_count
|
||||
|
||||
# ensure we don't lost any images or embeddings
|
||||
# ensure we don't lose any images or embeddings
|
||||
assert embed_counter == image_embeds.size(0)
|
||||
assert image_counter == image_grid_thw.size(0)
|
||||
assert len(image_batches) == len(result)
|
||||
@ -238,7 +238,7 @@ def batch_make_video_embeddings(
|
||||
embed_counter += cur_batch_embed_len
|
||||
video_counter += cur_batch_video_count
|
||||
|
||||
# ensure we don't lost any videos or embeddings
|
||||
# ensure we don't lose any videos or embeddings
|
||||
assert embed_counter == video_embeds.size(0)
|
||||
assert video_counter == video_grid_thw.size(0)
|
||||
assert len(video_batches) == len(result)
|
||||
|
||||
@ -247,7 +247,7 @@ def test_free_kv_cache_block_queue_append_n():
|
||||
|
||||
def test_free_kv_cache_block_queue_popleft_n():
|
||||
blocks = [KVCacheBlock(block_id=i) for i in range(6)]
|
||||
# Create a empty FreeKVCacheBlockQueue with these blocks
|
||||
# Create an empty FreeKVCacheBlockQueue with these blocks
|
||||
queue = FreeKVCacheBlockQueue(
|
||||
[blocks[1], blocks[3], blocks[5], blocks[4], blocks[0], blocks[2]])
|
||||
assert queue.num_free_blocks == 6
|
||||
|
||||
@ -27,7 +27,7 @@ class CustomMultiprocExecutor(MultiprocExecutor):
|
||||
kwargs: Optional[dict] = None,
|
||||
non_block: bool = False,
|
||||
unique_reply_rank: Optional[int] = None) -> list[Any]:
|
||||
# Drop marker to show that this was ran
|
||||
# Drop marker to show that this was run
|
||||
with open(".marker", "w"):
|
||||
...
|
||||
return super().collective_rpc(method, timeout, args, kwargs)
|
||||
|
||||
@ -183,7 +183,7 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
|
||||
mock_pp_group.world_size = pp_size
|
||||
mock_get_pp_group.return_value = mock_pp_group
|
||||
|
||||
# Setup the target model mock with a custom class so that
|
||||
# Set up the target model mock with a custom class so that
|
||||
# isinstance() checks match the expected type.
|
||||
class _TargetModelStub(LlamaForCausalLM):
|
||||
model: mock.MagicMock
|
||||
|
||||
@ -30,7 +30,7 @@ def test_initialize_kv_cache_for_kv_sharing_different_attn_groups():
|
||||
}
|
||||
|
||||
# Layers 0 and 1 both belong in KV cache group 0
|
||||
# However, if they have have different attention backends, they will be
|
||||
# However, if they have different attention backends, they will be
|
||||
# placed in different attention groups for KV cache group 0
|
||||
kv_cache_groups = [
|
||||
KVCacheGroupSpec(["model.layers.0", "model.layers.1"],
|
||||
|
||||
@ -702,7 +702,7 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
|
||||
KVCacheTensors for the attention and mamba layers
|
||||
(via _reshape_kv_cache_tensors function). This test verifies
|
||||
that the views are compatible: writing a mamba block
|
||||
will not corrupt an attention block and vice-versa
|
||||
will not corrupt an attention block and vice versa
|
||||
'''
|
||||
|
||||
current_platform.seed_everything(42)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user