mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-24 04:47:03 +08:00
more fixes
Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
parent
0889f66297
commit
ff2dd13145
@ -170,7 +170,7 @@ def _support_torch_compile(
|
|||||||
# e.g. TPU has the compilation logic in model runner, so we don't
|
# e.g. TPU has the compilation logic in model runner, so we don't
|
||||||
# need to compile the model inside.
|
# need to compile the model inside.
|
||||||
if self.do_not_compile or torch.compiler.is_compiling():
|
if self.do_not_compile or torch.compiler.is_compiling():
|
||||||
logger.info("SKIPPING COMPILATION")
|
# logger.info("SKIPPING COMPILATION")
|
||||||
return self.forward(*args, **kwargs)
|
return self.forward(*args, **kwargs)
|
||||||
|
|
||||||
# the first compilation needs to have dynamic shapes marked
|
# the first compilation needs to have dynamic shapes marked
|
||||||
|
|||||||
@ -337,14 +337,6 @@ class FlashAttentionMetadataBuilder(
|
|||||||
# populated on first build() call.
|
# populated on first build() call.
|
||||||
self.aot_sliding_window: Optional[tuple[int, int]] = None
|
self.aot_sliding_window: Optional[tuple[int, int]] = None
|
||||||
|
|
||||||
def build(
|
|
||||||
self, common_prefix_len: int,
|
|
||||||
common_attn_metadata: CommonAttentionMetadata
|
|
||||||
) -> FlashAttentionMetadata:
|
|
||||||
num_reqs = common_attn_metadata.num_reqs
|
|
||||||
num_actual_tokens = common_attn_metadata.num_actual_tokens
|
|
||||||
max_query_len = common_attn_metadata.max_query_len
|
|
||||||
|
|
||||||
def build_slice(
|
def build_slice(
|
||||||
self,
|
self,
|
||||||
req_slice: slice,
|
req_slice: slice,
|
||||||
|
|||||||
@ -610,7 +610,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
|
|||||||
num_actual_tokens = common_attn_metadata.num_actual_tokens
|
num_actual_tokens = common_attn_metadata.num_actual_tokens
|
||||||
max_query_len = common_attn_metadata.max_query_len
|
max_query_len = common_attn_metadata.max_query_len
|
||||||
|
|
||||||
assert self._num_decodes + self._num_prefills == num_reqs
|
# assert self._num_decodes + self._num_prefills == num_reqs
|
||||||
return self.build_slice(
|
return self.build_slice(
|
||||||
req_slice=slice(0, num_reqs),
|
req_slice=slice(0, num_reqs),
|
||||||
token_slice=slice(0, num_actual_tokens),
|
token_slice=slice(0, num_actual_tokens),
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user