diff --git a/benchmarks/kernels/deepgemm/README.md b/benchmarks/kernels/deepgemm/README.md index 41e68e047be8..a28c6956be0e 100644 --- a/benchmarks/kernels/deepgemm/README.md +++ b/benchmarks/kernels/deepgemm/README.md @@ -2,7 +2,7 @@ This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels. -Currently this just includes dense GEMMs and only works on Hopper GPUs. +Currently, this just includes dense GEMMs and only works on Hopper GPUs. ## Setup diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index d64e315b4fe3..8a3599416bc7 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -96,7 +96,7 @@ class VllmConfig: """`torch.compile` and cudagraph capture configuration for the model. As a shorthand, one can append compilation arguments via - -0.parameter=arguement such as `-O.mode=3` (same as `-O='{"mode":3}'`). + -O.parameter=argument such as `-O.mode=3` (same as `-O='{"mode":3}'`). You can specify the full compilation config like so: `{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}` diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 25fb7181a8f2..7cb490e391ab 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -153,7 +153,7 @@ class DPMetadata: @contextmanager def sp_local_sizes(self, sequence_parallel_size: int): """ - Context mamager for setting self.local_sizes. Same as self.chunked_sizes + Context manager for setting self.local_sizes. Same as self.chunked_sizes but without any chunking. """ self.local_sizes = _compute_sp_num_tokens( diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index d6fef450c028..4a2818ab1bfd 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -525,7 +525,7 @@ class InputBatch: # NOTE: the following is unsafe # self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\ # self.token_ids_cpu[i2, ...], self.token_ids_cpu[i1, ...] 
- # instead, we need to temporiarily copy the data for one of the indices + # instead, we need to temporarily copy the data for one of the indices # TODO(lucas): optimize this by only copying valid indices tmp = self.token_ids_cpu[i1, ...].copy() self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...]