Mirror of https://git.datalinker.icu/vllm-project/vllm.git
Fix typos in comments across multiple files (#30345)
Signed-off-by: Wilson Wu <iwilsonwu@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
parent 06462392e4
commit 3bdd426636
@@ -186,7 +186,7 @@ struct AttentionMetadata {
 // - Intermediate outputs: q_tile_size * head_dim * output_buffer_elem_size + 2
 //   * q_tile_size * 4, partial output, max + sum (float)
 // Reduction scratchpad contains:
-// - flags: bool array to indicate wether the split is finished
+// - flags: bool array to indicate whether the split is finished
 // - outputs: split_num * q_tile_size * head_dim * output_buffer_elem_size
 // - max, sum: 2 * split_num * q_tile_size * 4
 class AttentionScratchPad {
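To make the byte accounting in this comment concrete, here is a minimal Python sketch of the reduction-scratchpad size. The names mirror the comment, the one-byte-per-split flag size is an assumption, and this is not the kernel's actual allocation code.

```python
# Hypothetical host-side helper, not the kernel's code: total bytes for the
# reduction scratchpad described in the comment above.
def reduction_scratchpad_bytes(
    split_num: int,
    q_tile_size: int,
    head_dim: int,
    output_buffer_elem_size: int,
) -> int:
    flags = split_num                                          # assumed: one bool per split
    outputs = split_num * q_tile_size * head_dim * output_buffer_elem_size
    max_sum = 2 * split_num * q_tile_size * 4                  # float max + float sum
    return flags + outputs + max_sum

# Example: 4 splits, 64-row q tile, head_dim 128, fp16 partial outputs (2 bytes).
print(reduction_scratchpad_bytes(4, 64, 128, 2))
```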
@@ -617,7 +617,7 @@ struct MacheteCollectiveMma {

 // Same as upstream, should be kept the same when possible, not formatted for
 // easier comparison
-// with `SwapAB ? N : M -> M` since we dont support SwapAB
+// with `SwapAB ? N : M -> M` since we don't support SwapAB
 // clang-format off
 template<class ProblemShape>
 static bool
@@ -22,7 +22,7 @@ python tools/install_nixl_from_source_ubuntu.py
 NixlConnector uses NIXL library for underlying communication, which supports multiple transport backends. UCX (Unified Communication X) is the primary default transport library used by NIXL. Configure transport environment variables:

 ```bash
-# Example UCX configuration, adjust according to your enviroment
+# Example UCX configuration, adjust according to your environment
 export UCX_TLS=all # or specify specific transports like "rc,ud,sm,^cuda_ipc" ..etc
 export UCX_NET_DEVICES=all # or specify network devices like "mlx5_0:1,mlx5_1:1"
 ```
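As an illustrative alternative to the bash exports above, the same two settings can be applied from Python before vLLM (and therefore NIXL/UCX) initializes in the process; this is just a sketch, not a documented vLLM API.

```python
# Illustrative: mirror of the bash example above, set programmatically.
# Must run before UCX is initialized in this process.
import os

os.environ.setdefault("UCX_TLS", "all")          # or e.g. "rc,ud,sm,^cuda_ipc"
os.environ.setdefault("UCX_NET_DEVICES", "all")  # or e.g. "mlx5_0:1,mlx5_1:1"
```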
@@ -881,7 +881,7 @@ class FusedMoE(CustomOp):
 # Record that the clone will be used by shared_experts_stream
 # to avoid gc issue from deallocation of hidden_states_clone
 # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501
-# NOTE: We dont need shared_output.record_stream(current_stream())
+# NOTE: We don't need shared_output.record_stream(current_stream())
 # because we synch the streams before using shared_output.
 hidden_states_clone.record_stream(self.shared_experts_stream)

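The record_stream() pattern this comment refers to is documented in PyTorch; the standalone sketch below reproduces it with a side stream standing in for shared_experts_stream. It is not vLLM's FusedMoE code.

```python
import torch

# Standalone sketch of the pattern described above: a tensor produced on the
# current stream is consumed on a side stream, so record_stream() tells the
# caching allocator not to reuse its memory until that stream catches up.
assert torch.cuda.is_available()

side_stream = torch.cuda.Stream()          # stands in for shared_experts_stream
x = torch.randn(1024, 1024, device="cuda")
clone = x.clone()                          # produced on the current stream

side_stream.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(side_stream):
    y = clone @ clone                      # consumed on the side stream
clone.record_stream(side_stream)           # keep clone alive for side_stream

# The output does not need record_stream() because we synchronize the streams
# before using it, which is exactly what the NOTE above says.
torch.cuda.current_stream().wait_stream(side_stream)
print(y.sum().item())
```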
@@ -28,7 +28,7 @@ class CompressedTensorsW4A16Fp4(CompressedTensorsScheme):

 @classmethod
 def get_min_capability(cls) -> int:
-    # dont restrict as emulations
+    # don't restrict as emulations
     return 80

 def create_weights(
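As a hedged illustration of what a minimum capability of 80 means in practice, the sketch below compares it against the current GPU via torch.cuda.get_device_capability(); it is not vLLM's scheme-selection code, and the major * 10 + minor encoding is an assumption made for the example.

```python
import torch

# Sketch only: does the current GPU meet a scheme's minimum compute
# capability? 80 would correspond to SM 8.0 (Ampere) under the assumed
# major*10+minor encoding.
def supports_scheme(min_capability: int = 80) -> bool:
    major, minor = torch.cuda.get_device_capability()
    return major * 10 + minor >= min_capability

if torch.cuda.is_available():
    print(supports_scheme())
```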
@@ -4871,7 +4871,7 @@ class GPUModelRunner(
 # we need to adjust the cudagraph sizes to be a multiple of the uniform
 # decode query length to avoid: https://github.com/vllm-project/vllm/issues/28207
 # temp-fix: https://github.com/vllm-project/vllm/issues/28207#issuecomment-3504004536
-# Will be removed in the near future when we have seperate cudagraph capture
+# Will be removed in the near future when we have separate cudagraph capture
 # sizes for decode and mixed prefill-decode.
 if (
     cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
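The comment above describes rounding capture sizes so each is a multiple of the uniform decode query length; the sketch below shows one way such an adjustment could look, assuming the sizes are a plain list of ints. It is not the logic actually used in GPUModelRunner.

```python
# Sketch: round each capture size up to a multiple of the uniform decode
# query length and drop duplicates. Rounding direction is an assumption.
def adjust_cudagraph_sizes(sizes: list[int], uniform_decode_query_len: int) -> list[int]:
    q = uniform_decode_query_len
    return sorted({(s + q - 1) // q * q for s in sizes})

print(adjust_cudagraph_sizes([1, 2, 4, 8, 16, 24], uniform_decode_query_len=4))
# [4, 8, 16, 24]
```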
@@ -135,7 +135,7 @@ class AttentionGroup:
 kv_cache_spec: KVCacheSpec
 kv_cache_group_id: int
 # When ubatching is enabled we will have a metadata builder for each ubatch
-# so that if they use internal persistant buffers for cudagraphs, and they
+# so that if they use internal persistent buffers for cudagraphs, and they
 # won't have to worry about conflicting with the other ubatches.
 metadata_builders: list[AttentionMetadataBuilder] = field(
     default_factory=lambda: []
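To make the per-ubatch-builder idea concrete, here is a toy dataclass sketch: one builder object per ubatch index so each can own its own persistent buffers. The Builder class and get_builder helper are hypothetical stand-ins, not vLLM's AttentionGroup API.

```python
from dataclasses import dataclass, field

@dataclass
class Builder:  # stand-in for AttentionMetadataBuilder
    persistent_buffer: list = field(default_factory=list)

@dataclass
class Group:  # stand-in for AttentionGroup
    builders: list[Builder] = field(default_factory=list)

    def get_builder(self, ubatch_id: int = 0) -> Builder:
        # Lazily create one builder per ubatch so their persistent buffers
        # never alias across ubatches.
        while len(self.builders) <= ubatch_id:
            self.builders.append(Builder())
        return self.builders[ubatch_id]

g = Group()
assert g.get_builder(0) is not g.get_builder(1)
```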