diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 0474db0820c73..5f1b268a1d6fe 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -170,7 +170,7 @@ def _support_torch_compile(
         # e.g. TPU has the compilation logic in model runner, so we don't
         # need to compile the model inside.
         if self.do_not_compile or torch.compiler.is_compiling():
-            logger.info("SKIPPING COMPILATION")
+            # logger.info("SKIPPING COMPILATION")
             return self.forward(*args, **kwargs)
 
         # the first compilation needs to have dynamic shapes marked
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 88be86749d8d3..788aa0c537af5 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -337,14 +337,6 @@ class FlashAttentionMetadataBuilder(
         # populated on first build() call.
         self.aot_sliding_window: Optional[tuple[int, int]] = None
 
-    def build(
-        self, common_prefix_len: int,
-        common_attn_metadata: CommonAttentionMetadata
-    ) -> FlashAttentionMetadata:
-        num_reqs = common_attn_metadata.num_reqs
-        num_actual_tokens = common_attn_metadata.num_actual_tokens
-        max_query_len = common_attn_metadata.max_query_len
-
     def build_slice(
         self,
         req_slice: slice,
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index fbd95423b1c3b..fa9e94ad189ac 100644
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -610,7 +610,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
         num_actual_tokens = common_attn_metadata.num_actual_tokens
         max_query_len = common_attn_metadata.max_query_len
 
-        assert self._num_decodes + self._num_prefills == num_reqs
+        # assert self._num_decodes + self._num_prefills == num_reqs
         return self.build_slice(
             req_slice=slice(0, num_reqs),
             token_slice=slice(0, num_actual_tokens),