mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-07 16:49:09 +08:00
[Doc]: fix typos in various files (#24726)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
This commit is contained in:
parent
0377802c20
commit
bcb06d7baf
@@ -56,7 +56,7 @@ def w8a8_block_matmul(
|
|||||||
Bs: The per-block quantization scale for `B`.
|
Bs: The per-block quantization scale for `B`.
|
||||||
block_size: The block size for per-block quantization.
|
block_size: The block size for per-block quantization.
|
||||||
It should be 2-dim, e.g., [128, 128].
|
It should be 2-dim, e.g., [128, 128].
|
||||||
output_dytpe: The dtype of the returned tensor.
|
output_dtype: The dtype of the returned tensor.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
torch.Tensor: The result of matmul.
|
torch.Tensor: The result of matmul.
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ namespace vec_op {
|
|||||||
#define vec_sub(a, b) ((a) - (b))
|
#define vec_sub(a, b) ((a) - (b))
|
||||||
#define vec_mul(a, b) ((a) * (b))
|
#define vec_mul(a, b) ((a) * (b))
|
||||||
#define vec_div(a, b) ((a) / (b))
|
#define vec_div(a, b) ((a) / (b))
|
||||||
#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebaic
|
#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebraic
|
||||||
#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left
|
#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left
|
||||||
|
|
||||||
// FIXME: FP16 is not fully supported in Torch-CPU
|
// FIXME: FP16 is not fully supported in Torch-CPU
|
||||||
|
|||||||
@@ -215,7 +215,7 @@ int moe_align_block_size(
|
|||||||
offsets[mb + 1] = sorted_id_size(sorted_ids + mb * BLOCK_M);
|
offsets[mb + 1] = sorted_id_size(sorted_ids + mb * BLOCK_M);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
// TODO: do we need to vecterize this ?
|
// TODO: do we need to vectorize this ?
|
||||||
for (int mb = 0; mb < num_token_blocks; ++mb) {
|
for (int mb = 0; mb < num_token_blocks; ++mb) {
|
||||||
offsets[mb + 1] += offsets[mb];
|
offsets[mb + 1] += offsets[mb];
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ page for information on known issues and how to solve them.
|
|||||||
## Introduction
|
## Introduction
|
||||||
|
|
||||||
!!! important
|
!!! important
|
||||||
The source code references are to the state of the code at the time of writing in December, 2024.
|
The source code references are to the state of the code at the time of writing in December 2024.
|
||||||
|
|
||||||
The use of Python multiprocessing in vLLM is complicated by:
|
The use of Python multiprocessing in vLLM is complicated by:
|
||||||
|
|
||||||
|
|||||||
@@ -901,7 +901,7 @@ def _get_query_key_seq_metadata(
|
|||||||
attn_metadata.encoder_seq_start_loc,
|
attn_metadata.encoder_seq_start_loc,
|
||||||
attn_metadata.max_encoder_seq_len)
|
attn_metadata.max_encoder_seq_len)
|
||||||
elif attn_type == AttentionType.ENCODER:
|
elif attn_type == AttentionType.ENCODER:
|
||||||
# For encoder attention both the query and the key are same i.e the
|
# For encoder attention both the query and the key are same i.e. the
|
||||||
# encoder sequence.
|
# encoder sequence.
|
||||||
return (attn_metadata.encoder_seq_start_loc,
|
return (attn_metadata.encoder_seq_start_loc,
|
||||||
attn_metadata.max_encoder_seq_len,
|
attn_metadata.max_encoder_seq_len,
|
||||||
|
|||||||
@@ -551,7 +551,7 @@ class RandomDataset(BenchmarkDataset):
|
|||||||
[6880, 6881] -> ['Ġcalls', 'here'] ->
|
[6880, 6881] -> ['Ġcalls', 'here'] ->
|
||||||
[1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
|
[1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
|
||||||
To avoid uncontrolled change of the prompt length,
|
To avoid uncontrolled change of the prompt length,
|
||||||
the encoded sequence is truncated before being decode again.
|
the encoded sequence is truncated before being decoded again.
|
||||||
"""
|
"""
|
||||||
# Build the inner sequence by sampling sequentially from the vocab
|
# Build the inner sequence by sampling sequentially from the vocab
|
||||||
inner_seq = ((offset + index + np.arange(input_len))
|
inner_seq = ((offset + index + np.arange(input_len))
|
||||||
|
|||||||
@@ -242,7 +242,7 @@ def get_logits_processors(processors: Optional[LogitsProcessors],
|
|||||||
elif processors:
|
elif processors:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"The `logits_processors` argument is not supported by this "
|
"The `logits_processors` argument is not supported by this "
|
||||||
"server. See --logits-processor-pattern engine argugment "
|
"server. See --logits-processor-pattern engine argument "
|
||||||
"for more information.")
|
"for more information.")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
@@ -324,7 +324,7 @@ class MambaMixer2(MambaBase, CustomOp):
|
|||||||
# - the weight already has a "weight_loader" attribute
|
# - the weight already has a "weight_loader" attribute
|
||||||
# which set_weight_attrs will raise if we do not
|
# which set_weight_attrs will raise if we do not
|
||||||
# delete before trying to override it
|
# delete before trying to override it
|
||||||
# - ditto for the otther two weights below
|
# - ditto for the other two weights below
|
||||||
delattr(self.conv1d.bias, "weight_loader")
|
delattr(self.conv1d.bias, "weight_loader")
|
||||||
set_weight_attrs(
|
set_weight_attrs(
|
||||||
self.conv1d.bias,
|
self.conv1d.bias,
|
||||||
|
|||||||
@@ -1117,7 +1117,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
|
|||||||
|
|
||||||
def _process_multimodal_inputs(self, modalities: dict):
|
def _process_multimodal_inputs(self, modalities: dict):
|
||||||
# The result multimodal_embeddings is tuple of tensors, with each
|
# The result multimodal_embeddings is tuple of tensors, with each
|
||||||
# tensor correspoending to a multimodal data item (image or video).
|
# tensor corresponding to a multimodal data item (image or video).
|
||||||
multimodal_embeddings: tuple[torch.Tensor, ...] = ()
|
multimodal_embeddings: tuple[torch.Tensor, ...] = ()
|
||||||
|
|
||||||
# NOTE: It is important to iterate over the keys in this dictionary
|
# NOTE: It is important to iterate over the keys in this dictionary
|
||||||
|
|||||||
@@ -2659,7 +2659,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
num_tokens += num_pad
|
num_tokens += num_pad
|
||||||
|
|
||||||
# If cudagraph_mode.decode_mode() == FULL and
|
# If cudagraph_mode.decode_mode() == FULL and
|
||||||
# cudagraph_mode.seperate_routine(). This means that we are using
|
# cudagraph_mode.separate_routine(). This means that we are using
|
||||||
# different graphs and/or modes for mixed prefill-decode batches vs.
|
# different graphs and/or modes for mixed prefill-decode batches vs.
|
||||||
# uniform decode batches. A uniform decode batch means that all
|
# uniform decode batches. A uniform decode batch means that all
|
||||||
# requests have identical query length, except a potential virtual
|
# requests have identical query length, except a potential virtual
|
||||||
|
|||||||
@@ -392,7 +392,7 @@ class InputBatch:
|
|||||||
# NOTE: the following is unsafe
|
# NOTE: the following is unsafe
|
||||||
# self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\
|
# self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\
|
||||||
# self.token_ids_cpu[i2, ...], self.token_ids_cpu[i1, ...]
|
# self.token_ids_cpu[i2, ...], self.token_ids_cpu[i1, ...]
|
||||||
# instead, we need to temporiarily copy the data for one of the indices
|
# instead, we need to temporarily copy the data for one of the indices
|
||||||
# TODO(lucas): optimize this by only copying valid indices
|
# TODO(lucas): optimize this by only copying valid indices
|
||||||
tmp = self.token_ids_cpu[i1, ...].copy()
|
tmp = self.token_ids_cpu[i1, ...].copy()
|
||||||
self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...]
|
self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...]
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user