diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index f7adecbd88746..876114c2d33a4 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -403,7 +403,21 @@ class RocmPlatform(Platform):
             compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
 
         if cache_config and cache_config.block_size is None:
-            cache_config.block_size = 16
+            if (
+                envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION and envs.VLLM_ROCM_USE_AITER
+                # NOTE: This block has been deprecated
+                # or get_env_variable_attn_backend()
+                # == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN
+                # TODO: monitor https://github.com/vllm-project/vllm/pull/30396
+                # to see how we can transition to the new way of selecting
+                # attention backends
+            ):
+                cache_config.block_size = 64
+                logger.warning(
+                    "[ROCM_AITER_UNIFIED_ATTN]: Setting kv cache block size to 64."
+                )
+            else:
+                cache_config.block_size = 16
 
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
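
For reference, a minimal standalone sketch of the block-size selection this hunk introduces, assuming the env flags behave as booleans the way `envs.VLLM_ROCM_USE_AITER*` do; `pick_block_size` and `_flag` are hypothetical helpers for illustration only, not vLLM API.

import os


def _flag(name: str) -> bool:
    # Hypothetical stand-in for vLLM's env-var parsing in vllm/envs.py.
    return os.environ.get(name, "0").lower() in ("1", "true")


def pick_block_size(current):
    """Return the kv-cache block size the ROCm platform would choose."""
    if current is not None:
        # A user-provided block size is left untouched by the new hunk.
        return current
    if _flag("VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION") and _flag("VLLM_ROCM_USE_AITER"):
        # AITER unified attention path: use 64-token blocks.
        return 64
    # Default ROCm behavior, unchanged from before the patch.
    return 16


if __name__ == "__main__":
    print(pick_block_size(None))  # 16 unless both AITER flags are enabled

This mirrors the diff's behavior: the default stays 16, and 64 is only chosen when both AITER flags are set and the user has not configured a block size explicitly.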