Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-21 21:15:56 +08:00)
[Docs] Fix warnings in mkdocs build (continued) (#25042)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Parent: c2fdc71c91
Commit: dad5f4d16d
```diff
@@ -15,7 +15,7 @@ is used by model runners to dispatch data processing according to the target
 model.
 
 Info:
-    [mm_processing](../../../design/mm_processing.html)
+    [mm_processing](../../../design/mm_processing.md)
 """
 
 __all__ = [
```
```diff
@@ -3216,7 +3216,7 @@ def cprofile_context(save_file: Optional[str] = None):
 
     Args:
         save_file: path to save the profile result. "1" or
            None will result in printing to stdout.
     """
    import cProfile
 
```
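The docstring above describes `cprofile_context` as a profiling helper: pass a path to save the profile result, or `"1"`/`None` to print it to stdout. Below is a minimal sketch of that documented behavior, assuming a context-manager implementation built on the standard `cProfile`/`pstats` modules; the name `cprofile_context_sketch` and the stats formatting are illustrative, not vLLM's exact code.

```python
import cProfile
import pstats
from contextlib import contextmanager
from typing import Iterator, Optional


@contextmanager
def cprofile_context_sketch(save_file: Optional[str] = None) -> Iterator[None]:
    """Profile the enclosed block; dump to `save_file`, or print when None/"1"."""
    profiler = cProfile.Profile()
    profiler.enable()
    try:
        yield
    finally:
        profiler.disable()
        if save_file and save_file != "1":
            profiler.dump_stats(save_file)       # binary stats, readable via pstats
        else:
            pstats.Stats(profiler).sort_stats("cumulative").print_stats(20)


if __name__ == "__main__":
    with cprofile_context_sketch():              # prints the profile to stdout
        sum(i * i for i in range(100_000))
```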
```diff
@@ -3273,7 +3273,7 @@ def check_use_alibi(model_config: ModelConfig) -> bool:
                    and getattr(cfg.attn_config, "alibi", False)))))
 
 
-def sha256(input) -> bytes:
+def sha256(input: Any) -> bytes:
     """Hash any picklable Python object using SHA-256.
 
     The input is serialized using pickle before hashing, which allows
```
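The `sha256` helper hashes any picklable object; the diff only adds the `Any` annotation. A self-contained sketch of the pickle-then-hash pattern its docstring describes (the pickle protocol choice here is an assumption, not necessarily what vLLM uses):

```python
import hashlib
import pickle
from typing import Any


def sha256_sketch(obj: Any) -> bytes:
    # Serialize first so arbitrary picklable objects (tuples, dicts, ...) can be hashed.
    input_bytes = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
    return hashlib.sha256(input_bytes).digest()


assert len(sha256_sketch(("prefix", 1, 2, 3))) == 32  # SHA-256 digest is 32 bytes
```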
```diff
@@ -3290,7 +3290,7 @@ def sha256(input) -> bytes:
     return hashlib.sha256(input_bytes).digest()
 
 
-def sha256_cbor(input) -> bytes:
+def sha256_cbor(input: Any) -> bytes:
     """
     Hash objects using CBOR serialization and SHA-256.
 
```
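`sha256_cbor` does the same with CBOR serialization instead of pickle. A hedged sketch of that pattern, assuming the third-party `cbor2` package (CBOR gives a compact, cross-language encoding of plain data types):

```python
import hashlib
from typing import Any

import cbor2  # pip install cbor2


def sha256_cbor_sketch(obj: Any) -> bytes:
    # Encode with CBOR, then hash the resulting bytes.
    return hashlib.sha256(cbor2.dumps(obj)).digest()


digest = sha256_cbor_sketch({"prompt": [1, 2, 3], "lora": None})
print(digest.hex())
```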
```diff
@@ -1230,7 +1230,7 @@ def get_kv_cache_configs(vllm_config: VllmConfig,
         vllm_config: The global VllmConfig
         kv_cache_specs: List of dict[layer_name, KVCacheSpec] for each worker.
         available_memory: Memory available for KV cache in bytes for each
             worker.
 
     Returns:
         The generated KVCacheConfigs for each worker.
```
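The docstring clarifies that `kv_cache_specs` and `available_memory` are both per-worker lists, and that one `KVCacheConfig` comes back per worker. Purely illustrative shapes follow; the layer name and sizes are hypothetical, and real callers pass `KVCacheSpec` objects and a `VllmConfig` rather than the placeholder strings used here.

```python
spec_placeholder = "FullAttentionSpec(...)"  # stands in for a real KVCacheSpec
kv_cache_specs = [
    {"model.layers.0.self_attn.attn": spec_placeholder},  # worker 0
    {"model.layers.0.self_attn.attn": spec_placeholder},  # worker 1
]
available_memory = [8 * 2**30, 8 * 2**30]  # bytes free for KV cache on each worker

# Expected call shape; one KVCacheConfig is returned per worker:
# kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs, available_memory)
# assert len(kv_cache_configs) == len(kv_cache_specs)
```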
```diff
@@ -351,17 +351,17 @@ def generate_uniform_probs(
     without a seed.
 
     Args:
-        num_tokens : int
+        num_tokens: int
             Total number of tokens.
-        num_draft_tokens : List[List[int]]
+        num_draft_tokens: List[List[int]]
             Number of draft tokens per request.
-        generators : Optional[Dict[int, torch.Generator]]
+        generators: Optional[Dict[int, torch.Generator]]
             A dictionary mapping indices in the batch to
             `torch.Generator` objects.
-        device : torch.device
+        device: torch.device
             The device on which to allocate the tensor.
     Returns:
-        uniform_rand : torch.Tensor
+        uniform_rand: torch.Tensor
             A tensor of shape `(num_tokens, )` containing uniform
             random values in the range [0, 1).
     """
```
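The arguments describe a `(num_tokens,)` tensor of uniform [0, 1) values, with seeded `torch.Generator`s overriding the default RNG for the requests that have one. A simplified sketch of that documented behavior follows; it flattens `num_draft_tokens` to one count per request and is not vLLM's exact implementation.

```python
from typing import Dict, List, Optional

import torch


def generate_uniform_probs_sketch(
    num_tokens: int,
    num_draft_tokens: List[int],
    generators: Optional[Dict[int, torch.Generator]],
    device: torch.device,
) -> torch.Tensor:
    # Default path: one unseeded draw covering every token.
    uniform_rand = torch.rand(num_tokens, dtype=torch.float32, device=device)
    if generators:
        start = 0
        for req_idx, n in enumerate(num_draft_tokens):
            gen = generators.get(req_idx)
            if gen is not None and n > 0:
                # Re-draw this request's slice from its seeded generator.
                uniform_rand[start:start + n] = torch.rand(
                    n, generator=gen, dtype=torch.float32, device=device)
            start += n
    return uniform_rand


gen = torch.Generator().manual_seed(0)
probs = generate_uniform_probs_sketch(5, [2, 3], {1: gen}, torch.device("cpu"))
print(probs.shape)  # torch.Size([5]), values in [0, 1)
```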
```diff
@@ -1479,7 +1479,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         Args:
             scheduler_output: The scheduler output containing scheduled encoder
                 inputs.
 
         Returns:
             A tuple of (mm_kwargs, req_ids_pos) where:
```
```diff
@@ -205,7 +205,8 @@ def gather_mm_placeholders(
     """
     Reconstructs the embeddings from the placeholder tokens.
 
-    This is the operation of [scatter_mm_placeholders][].
+    This is the operation of [`scatter_mm_placeholders`]
+    [vllm.v1.worker.utils.scatter_mm_placeholders].
     """
     if is_embed is None:
         return placeholders
```
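`gather_mm_placeholders` is documented as the counterpart of `scatter_mm_placeholders`: scatter spreads multimodal embeddings into a placeholder-sized buffer, gather selects them back out. A hedged, mask-based sketch of the pair (simplified; the real functions operate on vLLM's placeholder layout):

```python
import torch


def scatter_sketch(embeds: torch.Tensor, is_embed: torch.Tensor) -> torch.Tensor:
    # Buffer with one row per placeholder position; embedding rows go where is_embed is True.
    placeholders = embeds.new_zeros(is_embed.shape[0], embeds.shape[-1])
    placeholders[is_embed] = embeds
    return placeholders


def gather_sketch(placeholders: torch.Tensor, is_embed: torch.Tensor) -> torch.Tensor:
    # With is_embed=None the full buffer would be returned unchanged (see the diff above).
    return placeholders[is_embed]


embeds = torch.randn(3, 8)                             # 3 multimodal embeddings, dim 8
is_embed = torch.tensor([True, False, True, False, True])
buf = scatter_sketch(embeds, is_embed)                 # shape (5, 8), zeros at text positions
assert torch.equal(gather_sketch(buf, is_embed), embeds)
```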
```diff
@@ -1810,7 +1810,8 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
 
         return [output]
 
-    def need_recv_kv(self, model_input, kv_caches) -> bool:
+    def need_recv_kv(self, model_input: ModelInputForGPUWithSamplingMetadata,
+                     kv_caches: List[torch.Tensor]) -> bool:
         """Check if we need to receive kv-cache from the other worker.
         We need to receive KV when
         1. current vLLM instance is KV cache consumer/decode vLLM instance
@@ -1825,6 +1826,9 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
         if self.vllm_config.kv_transfer_config is None:
             return False
 
+        if model_input.attn_metadata is None:
+            raise ValueError("model_input.attn_metadata cannot be None")
+
         prefill_meta = model_input.attn_metadata.prefill_metadata
 
         # check if the current run is profiling
@@ -1835,7 +1839,8 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
         return self.vllm_config.kv_transfer_config.is_kv_consumer and (
             not is_profile_run) and is_prefill_run
 
-    def need_send_kv(self, model_input, kv_caches) -> bool:
+    def need_send_kv(self, model_input: ModelInputForGPUWithSamplingMetadata,
+                     kv_caches: List[torch.Tensor]) -> bool:
         """Check if we need to send kv-cache to the other worker.
         We need to send KV when
         1. current vLLM instance is KV cache producer/prefill vLLM instance
@@ -1850,6 +1855,9 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
         if self.vllm_config.kv_transfer_config is None:
             return False
 
+        if model_input.attn_metadata is None:
+            raise ValueError("model_input.attn_metadata cannot be None")
+
         prefill_meta = model_input.attn_metadata.prefill_metadata
 
         # check if the current run is profiling
```
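Both predicates reduce to the same three conditions spelled out in their docstrings: the instance's producer/consumer role, whether the current run is a profiling run, and whether it is a prefill run. A plain-Python sketch of that decision logic with the inputs reduced to booleans; the real methods derive them from `kv_transfer_config` and `attn_metadata` (hence the new `None` check added in the diff).

```python
def need_recv_kv_sketch(is_kv_consumer: bool, is_profile_run: bool,
                        is_prefill_run: bool) -> bool:
    # A decode (consumer) instance pulls KV produced elsewhere, but only for
    # real prefill work, never for profiling runs.
    return is_kv_consumer and not is_profile_run and is_prefill_run


def need_send_kv_sketch(is_kv_producer: bool, is_profile_run: bool,
                        is_prefill_run: bool) -> bool:
    # A prefill (producer) instance pushes the KV it just computed, under the
    # same profiling/prefill constraints.
    return is_kv_producer and not is_profile_run and is_prefill_run


assert need_recv_kv_sketch(True, False, True)      # decode instance, real prefill
assert not need_send_kv_sketch(True, True, True)   # profiling run: never transfer
```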