mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-22 08:04:27 +08:00
fix some typos (#24616)
Signed-off-by: co63oc <co63oc@users.noreply.github.com>
This commit is contained in:
parent
c1eda615ba
commit
e26fef8397
@ -23,7 +23,7 @@ class TestSetting:
|
|||||||
fullgraph: bool
|
fullgraph: bool
|
||||||
|
|
||||||
|
|
||||||
# we cannot afford testing the full Catesian product
|
# we cannot afford testing the full Cartesian product
|
||||||
# of all models and all levels
|
# of all models and all levels
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"test_setting",
|
"test_setting",
|
||||||
|
|||||||
@ -345,7 +345,7 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
|||||||
# in the mamba2 ssd kernels, by comparing concatenation (in the sequence
|
# in the mamba2 ssd kernels, by comparing concatenation (in the sequence
|
||||||
# dimension) of chunked results with the full sequence result.
|
# dimension) of chunked results with the full sequence result.
|
||||||
# It is different from test_mamba_chunk_scan_cont_batch by:
|
# It is different from test_mamba_chunk_scan_cont_batch by:
|
||||||
# 1. Not using the naive torch implementaion (ssd_minimal_discrete) to get
|
# 1. Not using the naive torch implementation (ssd_minimal_discrete) to get
|
||||||
# reference outputs. Instead, it compares chunked kernel outputs to full
|
# reference outputs. Instead, it compares chunked kernel outputs to full
|
||||||
# sequence kernel outputs. This is the most straightforward way to
|
# sequence kernel outputs. This is the most straightforward way to
|
||||||
# assert chunked prefill correctness.
|
# assert chunked prefill correctness.
|
||||||
|
|||||||
@ -179,7 +179,7 @@ def chunk_local_cumsum_vector(
|
|||||||
def grid(meta):
|
def grid(meta):
|
||||||
return (triton.cdiv(meta['S'], meta['BS']), NT, B * H)
|
return (triton.cdiv(meta['S'], meta['BS']), NT, B * H)
|
||||||
|
|
||||||
# keep cummulative normalizer in fp32
|
# keep cumulative normalizer in fp32
|
||||||
# this kernel is equivalent to
|
# this kernel is equivalent to
|
||||||
# g = g.view(B, H, NT, BT, -1).cumsum(-2).view(B, H, T, -1)
|
# g = g.view(B, H, NT, BT, -1).cumsum(-2).view(B, H, T, -1)
|
||||||
chunk_local_cumsum_vector_kernel[grid](g_org,
|
chunk_local_cumsum_vector_kernel[grid](g_org,
|
||||||
|
|||||||
@ -1322,7 +1322,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
|
|||||||
k_scale: torch.Tensor,
|
k_scale: torch.Tensor,
|
||||||
dcp_world_size: int,
|
dcp_world_size: int,
|
||||||
):
|
):
|
||||||
assert k_scale is None, "DCP not support sacled kvcache now."
|
assert k_scale is None, "DCP not support scaled kvcache now."
|
||||||
assert attn_metadata.prefill is not None
|
assert attn_metadata.prefill is not None
|
||||||
prefill_metadata = attn_metadata.prefill
|
prefill_metadata = attn_metadata.prefill
|
||||||
assert prefill_metadata.chunked_context is not None
|
assert prefill_metadata.chunked_context is not None
|
||||||
|
|||||||
@ -112,9 +112,9 @@ class BlockTable:
|
|||||||
# tokens.
|
# tokens.
|
||||||
virtual_block_offsets = positions % virtual_block_size
|
virtual_block_offsets = positions % virtual_block_size
|
||||||
mask = virtual_block_offsets % self.dcp_world_size == self.dcp_rank
|
mask = virtual_block_offsets % self.dcp_world_size == self.dcp_rank
|
||||||
# Calcuate local block_offsets
|
# Calculate local block_offsets
|
||||||
block_offsets = virtual_block_offsets // self.dcp_world_size
|
block_offsets = virtual_block_offsets // self.dcp_world_size
|
||||||
# Calcuate slot_mapping
|
# Calculate slot_mapping
|
||||||
slot_mapping = block_numbers * self.block_size + block_offsets
|
slot_mapping = block_numbers * self.block_size + block_offsets
|
||||||
# Write final slots, use -1 for not-local
|
# Write final slots, use -1 for not-local
|
||||||
self.slot_mapping_np[:req_indices.shape[0]] = np.where(
|
self.slot_mapping_np[:req_indices.shape[0]] = np.where(
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user