mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-07 10:27:04 +08:00
more fixes
Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
parent
0889f66297
commit
ff2dd13145
@ -170,7 +170,7 @@ def _support_torch_compile(
|
||||
# e.g. TPU has the compilation logic in model runner, so we don't
|
||||
# need to compile the model inside.
|
||||
if self.do_not_compile or torch.compiler.is_compiling():
|
||||
logger.info("SKIPPING COMPILATION")
|
||||
# logger.info("SKIPPING COMPILATION")
|
||||
return self.forward(*args, **kwargs)
|
||||
|
||||
# the first compilation needs to have dynamic shapes marked
|
||||
|
||||
@ -337,14 +337,6 @@ class FlashAttentionMetadataBuilder(
|
||||
# populated on first build() call.
|
||||
self.aot_sliding_window: Optional[tuple[int, int]] = None
|
||||
|
||||
def build(
|
||||
self, common_prefix_len: int,
|
||||
common_attn_metadata: CommonAttentionMetadata
|
||||
) -> FlashAttentionMetadata:
|
||||
num_reqs = common_attn_metadata.num_reqs
|
||||
num_actual_tokens = common_attn_metadata.num_actual_tokens
|
||||
max_query_len = common_attn_metadata.max_query_len
|
||||
|
||||
def build_slice(
|
||||
self,
|
||||
req_slice: slice,
|
||||
|
||||
@ -610,7 +610,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
|
||||
num_actual_tokens = common_attn_metadata.num_actual_tokens
|
||||
max_query_len = common_attn_metadata.max_query_len
|
||||
|
||||
assert self._num_decodes + self._num_prefills == num_reqs
|
||||
# assert self._num_decodes + self._num_prefills == num_reqs
|
||||
return self.build_slice(
|
||||
req_slice=slice(0, num_reqs),
|
||||
token_slice=slice(0, num_actual_tokens),
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user