diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
index fe9e7a9891941..e6c7150500cc3 100644
--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -855,7 +855,8 @@ class SinkFullAttentionManager(FullAttentionManager):
             req_blocks = self.req_to_blocks[request_id]
             assert len(req_blocks) == 0
             # Append both sink blocks and hitted prefix cache blocks
-            req_blocks.extend(self.sink_blocks + new_computed_blocks)
+            req_blocks.extend(self.sink_blocks)
+            req_blocks.extend(new_computed_blocks)
             self.num_cached_block[request_id] = len(new_computed_blocks)
         else:
             # A running request. Should not have new computed blocks.
diff --git a/vllm/v1/worker/gpu/attn_utils.py b/vllm/v1/worker/gpu/attn_utils.py
index 6845652688430..604edf3d7ae29 100644
--- a/vllm/v1/worker/gpu/attn_utils.py
+++ b/vllm/v1/worker/gpu/attn_utils.py
@@ -102,8 +102,8 @@ def _reshape_kv_cache(
         attn_backend = attn_backends[layer_name]

         if (
-            getattr(kv_cache_spec, "head_size_v", kv_cache_spec.head_size)
-            != kv_cache_spec.head_size
+            hasattr(kv_cache_spec, "head_size_v")
+            and kv_cache_spec.head_size_v != kv_cache_spec.head_size
         ):
             kwargs = {"head_size_v": kv_cache_spec.head_size_v}
             stride_kwargs = {"diff_kv": True}
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 0bf773c84596d..e02e7ee2d4c26 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -5207,8 +5207,8 @@ class GPUModelRunner(
             kernel_num_blocks = num_blocks * num_blocks_per_kv_block

             if (
-                getattr(kv_cache_spec, "head_size_v", kv_cache_spec.head_size)
-                != kv_cache_spec.head_size
+                hasattr(kv_cache_spec, "head_size_v")
+                and kv_cache_spec.head_size_v != kv_cache_spec.head_size
             ):
                 kwargs = {"head_size_v": kv_cache_spec.head_size_v}
                 stride_kwargs = {"diff_kv": True}
diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py
index a31013502c377..a45b83dc788f1 100644
--- a/vllm/v1/worker/kv_connector_model_runner_mixin.py
+++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py
@@ -191,8 +191,8 @@ class KVConnectorModelRunnerMixin:
             attn_backend = attn_group.backend

             if (
-                getattr(kv_cache_spec, "head_size_v", kv_cache_spec.head_size)
-                != kv_cache_spec.head_size
+                hasattr(kv_cache_spec, "head_size_v")
+                and kv_cache_spec.head_size_v != kv_cache_spec.head_size
             ):
                 kwargs = {"head_size_v": kv_cache_spec.head_size_v}
                 stride_kwargs = {"diff_kv": True}
@@ -269,8 +269,8 @@ class KVConnectorModelRunnerMixin:
             attn_backend = attn_group.backend

             if (
-                getattr(kv_cache_spec, "head_size_v", kv_cache_spec.head_size)
-                != kv_cache_spec.head_size
+                hasattr(kv_cache_spec, "head_size_v")
+                and kv_cache_spec.head_size_v != kv_cache_spec.head_size
            ):
                 kwargs = {"head_size_v": kv_cache_spec.head_size_v}
                 stride_kwargs = {"diff_kv": True}