Mirror of https://git.datalinker.icu/vllm-project/vllm.git, synced 2025-12-09 05:34:55 +08:00
[Doc]: fix typos in various files (#24726)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
commit bcb06d7baf
parent 0377802c20
@@ -56,7 +56,7 @@ def w8a8_block_matmul(
         Bs: The per-block quantization scale for `B`.
         block_size: The block size for per-block quantization.
             It should be 2-dim, e.g., [128, 128].
-        output_dytpe: The dtype of the returned tensor.
+        output_dtype: The dtype of the returned tensor.
 
     Returns:
         torch.Tensor: The result of matmul.
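
Aside on this hunk: the docstring describes per-block quantization, where `Bs` holds one scale per [128, 128] tile of `B`. A minimal sketch of what that means, assuming `B` is a quantized [N, K] weight and `Bs` is laid out [N-tiles, K-tiles]; this is an illustration of the scale semantics, not vLLM's fused kernel:

import torch

def blockwise_dequant(B: torch.Tensor, Bs: torch.Tensor,
                      block_size: list[int]) -> torch.Tensor:
    """Expand per-block scales to elementwise scales and apply them."""
    block_n, block_k = block_size  # e.g. [128, 128]
    # Repeat each scale over its [block_n, block_k] tile, then crop to B.
    scales = Bs.repeat_interleave(block_n, dim=0)
    scales = scales.repeat_interleave(block_k, dim=1)
    scales = scales[: B.shape[0], : B.shape[1]]
    return B.to(torch.float32) * scales

# B_q: quantized weights [256, 256]; Bs: one scale per 128x128 tile.
B_q = torch.randint(-127, 127, (256, 256), dtype=torch.int8)
Bs = torch.rand(2, 2)
B_deq = blockwise_dequant(B_q, Bs, [128, 128])  # float32 [256, 256]
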
@@ -12,7 +12,7 @@ namespace vec_op {
 #define vec_sub(a, b) ((a) - (b))
 #define vec_mul(a, b) ((a) * (b))
 #define vec_div(a, b) ((a) / (b))
-#define vec_sr(a, b) ((a) >> (b))  // Vector Shift Right Algebaic
+#define vec_sr(a, b) ((a) >> (b))  // Vector Shift Right Algebraic
 #define vec_sl(a, b) ((a) << (b))  // Vector Shift Left
 
 // FIXME: FP16 is not fully supported in Torch-CPU
@@ -215,7 +215,7 @@ int moe_align_block_size(
       offsets[mb + 1] = sorted_id_size(sorted_ids + mb * BLOCK_M);
     }
   });
-  // TODO: do we need to vecterize this ?
+  // TODO: do we need to vectorize this ?
   for (int mb = 0; mb < num_token_blocks; ++mb) {
     offsets[mb + 1] += offsets[mb];
   }
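
Aside on this hunk: the serial loop turns per-block token counts into cumulative offsets, i.e. an inclusive prefix sum. A sketch in NumPy (the counts are made up), which also shows the vectorized `np.cumsum` form the TODO alludes to:

import numpy as np

counts = np.array([3, 1, 4, 2])  # tokens landing in each block
offsets = np.zeros(len(counts) + 1, dtype=np.int64)
offsets[1:] = counts

# Serial form, mirroring the C++ loop above:
for mb in range(len(counts)):
    offsets[mb + 1] += offsets[mb]

# Vectorized equivalent:
assert (offsets[1:] == np.cumsum(counts)).all()
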
@@ -8,7 +8,7 @@ page for information on known issues and how to solve them.
 ## Introduction
 
 !!! important
-    The source code references are to the state of the code at the time of writing in December, 2024.
+    The source code references are to the state of the code at the time of writing in December 2024.
 
 The use of Python multiprocessing in vLLM is complicated by:
 
@@ -901,7 +901,7 @@ def _get_query_key_seq_metadata(
                 attn_metadata.encoder_seq_start_loc,
                 attn_metadata.max_encoder_seq_len)
     elif attn_type == AttentionType.ENCODER:
-        # For encoder attention both the query and the key are same i.e the
+        # For encoder attention both the query and the key are same i.e. the
         # encoder sequence.
         return (attn_metadata.encoder_seq_start_loc,
                 attn_metadata.max_encoder_seq_len,
@@ -551,7 +551,7 @@ class RandomDataset(BenchmarkDataset):
         [6880, 6881] -> ['Ġcalls', 'here'] ->
         [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
         To avoid uncontrolled change of the prompt length,
-        the encoded sequence is truncated before being decode again.
+        the encoded sequence is truncated before being decoded again.
         """
         # Build the inner sequence by sampling sequentially from the vocab
         inner_seq = ((offset + index + np.arange(input_len))
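
The docstring in this hunk documents tokenizer round-trip drift: decoding ids and re-encoding the text can change the token count. A rough sketch of the effect and the truncation guard, assuming a GPT-2 style BPE tokenizer from `transformers`; the exact ids and the real benchmark logic differ:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")

ids = [6880, 6881]            # sampled token ids
text = tok.decode(ids)        # token boundaries are lost in the text
re_ids = tok.encode(text)     # re-encoding may split differently
print(len(ids), len(re_ids))  # lengths can disagree, as in the docstring

# The guard described above: truncate to the target length before decoding,
# so the resulting prompt cannot silently grow.
target_len = len(ids)
prompt = tok.decode(re_ids[:target_len])
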
@@ -242,7 +242,7 @@ def get_logits_processors(processors: Optional[LogitsProcessors],
     elif processors:
         raise ValueError(
             "The `logits_processors` argument is not supported by this "
-            "server. See --logits-processor-pattern engine argugment "
+            "server. See --logits-processor-pattern engine argument "
             "for more information.")
     return None
 
@@ -324,7 +324,7 @@ class MambaMixer2(MambaBase, CustomOp):
         # - the weight already has a "weight_loader" attribute
         #   which set_weight_attrs will raise if we do not
         #   delete before trying to override it
-        # - ditto for the otther two weights below
+        # - ditto for the other two weights below
         delattr(self.conv1d.bias, "weight_loader")
         set_weight_attrs(
             self.conv1d.bias,
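
Context for the delattr in this hunk: per the comment, `set_weight_attrs` raises if the attribute already exists, so the old `weight_loader` must be removed before it can be replaced. A self-contained sketch of that guard pattern; the helper body here is a simplification, not the exact vLLM code:

import torch

def set_weight_attrs(weight: torch.Tensor, weight_attrs: dict) -> None:
    # Simplified stand-in: refuse to silently overwrite existing attributes.
    for key, value in weight_attrs.items():
        assert not hasattr(weight, key), f"{key} is already set on the weight"
        setattr(weight, key, value)

bias = torch.nn.Parameter(torch.zeros(4))
set_weight_attrs(bias, {"weight_loader": lambda *args: None})

# Overriding the loader requires deleting the old attribute first,
# exactly the delattr-then-set dance in the hunk above.
delattr(bias, "weight_loader")
set_weight_attrs(bias, {"weight_loader": lambda *args: None})
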
@@ -1117,7 +1117,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
 
     def _process_multimodal_inputs(self, modalities: dict):
         # The result multimodal_embeddings is tuple of tensors, with each
-        # tensor correspoending to a multimodal data item (image or video).
+        # tensor corresponding to a multimodal data item (image or video).
         multimodal_embeddings: tuple[torch.Tensor, ...] = ()
 
         # NOTE: It is important to iterate over the keys in this dictionary
@@ -2659,7 +2659,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             num_tokens += num_pad
 
         # If cudagraph_mode.decode_mode() == FULL and
-        # cudagraph_mode.seperate_routine(). This means that we are using
+        # cudagraph_mode.separate_routine(). This means that we are using
         # different graphs and/or modes for mixed prefill-decode batches vs.
         # uniform decode batches. A uniform decode batch means that all
         # requests have identical query length, except a potential virtual
@@ -392,7 +392,7 @@ class InputBatch:
         # NOTE: the following is unsafe
         # self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\
         #     self.token_ids_cpu[i2, ...], self.token_ids_cpu[i1, ...]
-        # instead, we need to temporiarily copy the data for one of the indices
+        # instead, we need to temporarily copy the data for one of the indices
         # TODO(lucas): optimize this by only copying valid indices
         tmp = self.token_ids_cpu[i1, ...].copy()
         self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...]
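
The NOTE in this hunk is easy to verify: with NumPy, `a[i1, ...], a[i2, ...] = a[i2, ...], a[i1, ...]` hands out views, so writing the first row mutates the right-hand side before the second assignment runs. A small demo of the failure and the temporary-copy fix used above:

import numpy as np

a = np.array([[1, 1], [2, 2]])
a[0, ...], a[1, ...] = a[1, ...], a[0, ...]  # unsafe: both rows end up [2, 2]
print(a)

# Safe version, as in the code above: copy one row first.
b = np.array([[1, 1], [2, 2]])
tmp = b[0, ...].copy()
b[0, ...] = b[1, ...]
b[1, ...] = tmp                              # rows correctly swapped
print(b)
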