diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 0c4fae1dde5d1..e7655b6c30263 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -36,6 +36,7 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QuantizationMethods
+from vllm.platforms import CpuArchEnum, current_platform
 from vllm.plugins import load_general_plugins
 from vllm.reasoning import ReasoningParserManager
 from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
@@ -1096,7 +1097,6 @@ class EngineArgs:
         If VLLM_USE_V1 is specified by the user but the VllmConfig
         is incompatible, we raise an error.
         """
-        from vllm.platforms import current_platform
         current_platform.pre_register_and_update()
 
         device_config = DeviceConfig(
@@ -1123,9 +1123,16 @@ class EngineArgs:
         # Set default arguments for V0 or V1 Engine.
         if use_v1:
             self._set_default_args_v1(usage_context, model_config)
+            # Disable chunked prefill for POWER (ppc64le)/ARM CPUs in V1
+            if current_platform.is_cpu(
+            ) and current_platform.get_cpu_architecture() in (
+                    CpuArchEnum.POWERPC, CpuArchEnum.ARM):
+                logger.info(
+                    "Chunked prefill is not supported for ARM and POWER CPUs; "
+                    "disabling it for V1 backend.")
+                self.enable_chunked_prefill = False
         else:
             self._set_default_args_v0(model_config)
-
         assert self.enable_chunked_prefill is not None
 
         if envs.VLLM_ATTENTION_BACKEND in [STR_DUAL_CHUNK_FLASH_ATTN_VAL]:
@@ -1242,7 +1249,6 @@ class EngineArgs:
             if self.enable_chunked_prefill and self.pipeline_parallel_size > 1:
                 raise ValueError("Multi-Step Chunked-Prefill is not supported "
                                  "for pipeline-parallel-size > 1")
-            from vllm.platforms import current_platform
             if current_platform.is_cpu():
                 logger.warning("Multi-Step (--num-scheduler-steps > 1) is "
                                "currently not supported for CPUs and has been "
@@ -1391,7 +1397,6 @@ class EngineArgs:
         # Skip this check if we are running on a non-GPU platform,
         # or if the device capability is not available
         # (e.g. in a Ray actor without GPUs).
-        from vllm.platforms import current_platform
         if (current_platform.is_cuda()
                 and current_platform.get_device_capability()
                 and current_platform.get_device_capability().major < 8):
@@ -1652,7 +1657,6 @@ class EngineArgs:
         # as the platform that vLLM is running on (e.g. the case of scaling
         # vLLM with Ray) and has no GPUs. In this case we use the default
         # values for non-H100/H200 GPUs.
-        from vllm.platforms import current_platform
         try:
             device_memory = current_platform.get_device_total_memory()
             device_name = current_platform.get_device_name().lower()
@@ -1755,7 +1759,6 @@ class AsyncEngineArgs(EngineArgs):
         parser.add_argument('--disable-log-requests',
                             action='store_true',
                             help='Disable logging requests.')
-        from vllm.platforms import current_platform
         current_platform.pre_register_and_update(parser)
         return parser
 
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index e999a58320228..913cb0895bb94 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -271,5 +271,6 @@ class CpuPlatform(Platform):
         """Returns whether the current platform can use v1 by default for the
         supplied model configuration.
""" - return cls.supports_v1( - model_config) and cls.get_cpu_architecture() == CpuArchEnum.X86 + arch = cls.get_cpu_architecture() + return (cls.supports_v1(model_config) and arch + in (CpuArchEnum.X86, CpuArchEnum.POWERPC, CpuArchEnum.ARM)) diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index 08e802958b69c..d6270fbf31969 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -316,7 +316,6 @@ class TorchSDPAMetadataBuilderV1(AttentionMetadataBuilder[TorchSDPAMetadata]): block_table: BlockTable) -> None: self.runner = runner self.block_table = block_table - # For reorder self.reorder_prompt_req_index_list = np.empty(self.runner.max_num_reqs, dtype=np.int64) @@ -401,11 +400,14 @@ class TorchSDPAMetadataBuilderV1(AttentionMetadataBuilder[TorchSDPAMetadata]): num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, slot_mapping=slot_mapping, + # to ensure inference when chunked_prefill is disabled + seq_lens=runner.seq_lens_cpu[:num_reqs].tolist(), seq_lens_tensor=runner. seq_lens_cpu[num_prompt_req:num_reqs], # decode max_decode_seq_len=max_decode_seq_len, # decode block_tables=block_table_tensor[num_prompt_req:num_reqs], # decode - chunked_prefill=True, + chunked_prefill=self.runner.scheduler_config. + chunked_prefill_enabled, max_query_len=max_query_len, max_kv_len=max_prefill_seq_len, prefill_query_start_loc=runner. diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index 7712b7974544f..0bd3e580ba039 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -11,7 +11,7 @@ from vllm.config import VllmConfig from vllm.distributed.parallel_state import get_pp_group, get_tp_group from vllm.logger import init_logger from vllm.model_executor.utils import set_random_seed -from vllm.platforms import current_platform +from vllm.platforms import CpuArchEnum, current_platform from vllm.sequence import IntermediateTensors from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import ModelRunnerOutput @@ -43,8 +43,12 @@ class CPUWorker(Worker): omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND self.local_omp_cpuid = "all" if omp_cpuids == "auto": - self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes( - ) + if current_platform.get_cpu_architecture() == CpuArchEnum.POWERPC: + self.local_omp_cpuid = ( + self.get_cpus_id_binding_based_on_numa_nodes_ppc64le()) + else: + self.local_omp_cpuid = ( + self.get_cpus_id_binding_based_on_numa_nodes()) else: self.local_omp_cpuid = omp_cpuids.split("|")[self.rank] @@ -153,3 +157,57 @@ class CPUWorker(Worker): "fallback to no thread-binding. To get better performance," "please try to manually bind threads.") return rank_to_cpus + + def get_cpus_id_binding_based_on_numa_nodes_ppc64le(self) -> str: + """ + Power (ppc64le) specific: Selects a subset of threads per core for + each NUMA node.This is robust to SMT mode (SMT-8, SMT-4, etc) + because the OS only exposes available threads.This maximizes + performance by avoiding oversubscription of logical CPUs on Power. 
+        """
+
+        def select_threads_per_power_core(node_cpu_ids):
+            return [cpu for cpu in node_cpu_ids if cpu % 8 < 4]
+
+        rank_to_cpus = self.local_omp_cpuid
+        world_size = self.vllm_config.parallel_config.world_size
+        libnuma_found = util.find_spec("numa") is not None
+        psutil_found = util.find_spec("psutil") is not None
+        if libnuma_found and psutil_found:
+            import psutil
+            from numa import info
+            cpus_allow_list = psutil.Process().cpu_affinity()
+            numa_size = info.get_num_configured_nodes()
+
+            node_to_cpus = []
+            for i in range(numa_size):
+                node_intersect = set(
+                    info.node_to_cpus(i)).intersection(cpus_allow_list)
+                if bool(node_intersect):
+                    node_to_cpus.append(sorted(list(node_intersect)))
+
+            if world_size > len(node_to_cpus):
+                logger.error(
+                    "Auto thread-binding failed due to "
+                    "world size: %d is larger than "
+                    "allowed NUMA nodes number: %d. "
+                    "Please try to bind threads manually.", world_size,
+                    len(node_to_cpus))
+            else:
+                node_cpus_this_rank = node_to_cpus[self.rank]
+                node_cpus_this_rank = select_threads_per_power_core(
+                    node_cpus_this_rank)
+                cpu_count_per_numa = len(node_cpus_this_rank)
+                num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU,
+                                          cpu_count_per_numa // 2)
+                end = cpu_count_per_numa - num_of_reserved_cpu
+                rank_to_cpus_list = node_cpus_this_rank[:end]
+                rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list)
+                logger.info("ppc64le thread-binding list: %s", rank_to_cpus)
+        else:
+            logger.warning(
+                "Auto thread-binding is not supported due to "
+                "the lack of packages numa and psutil; "
+                "falling back to no thread-binding. To get better "
+                "performance, please try to manually bind threads.")
+        return rank_to_cpus
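
For reference, a standalone sketch (not part of the patch) of the CPU defaults this change introduces: which CPU architectures default to the V1 engine via CpuPlatform.default_v1, and which of those additionally have chunked prefill disabled in EngineArgs. The CpuArch enum and helper names below are illustrative stand-ins for vLLM's CpuArchEnum and platform checks, not the real API.

from enum import Enum, auto


class CpuArch(Enum):
    # Illustrative stand-in for vllm.platforms.CpuArchEnum.
    X86 = auto()
    POWERPC = auto()
    ARM = auto()
    OTHER = auto()


def defaults_to_v1(arch: CpuArch) -> bool:
    # Mirrors the new CpuPlatform.default_v1: x86, POWER and ARM CPUs now
    # default to the V1 engine (provided the model itself supports V1).
    return arch in (CpuArch.X86, CpuArch.POWERPC, CpuArch.ARM)


def chunked_prefill_default(arch: CpuArch) -> bool:
    # Mirrors the EngineArgs change: chunked prefill stays enabled on x86
    # but is turned off for POWER (ppc64le) and ARM CPUs under V1.
    return defaults_to_v1(arch) and arch not in (CpuArch.POWERPC, CpuArch.ARM)


if __name__ == "__main__":
    for arch in CpuArch:
        print(f"{arch.name:8s} v1={defaults_to_v1(arch)} "
              f"chunked_prefill={chunked_prefill_default(arch)}")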
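
And a minimal, runnable sketch of the ppc64le thread-binding heuristic added above, assuming an SMT-8 layout in which the logical CPUs of one core are numbered consecutively. select_threads_per_power_core and the tail "reserved CPU" slicing mirror the new get_cpus_id_binding_based_on_numa_nodes_ppc64le; binding_list_for_rank and the example CPU numbering are hypothetical.

def select_threads_per_power_core(node_cpu_ids: list[int]) -> list[int]:
    # Keep the first 4 hardware threads of every SMT-8 core (cpu % 8 < 4).
    return [cpu for cpu in node_cpu_ids if cpu % 8 < 4]


def binding_list_for_rank(node_cpu_ids: list[int],
                          num_reserved_cpus: int = 0) -> str:
    # Filter to a subset of threads per core, then drop a few CPUs at the
    # tail as "reserved", mirroring the slicing in the patch above.
    selected = select_threads_per_power_core(node_cpu_ids)
    reserved = min(num_reserved_cpus, len(selected) // 2)
    end = len(selected) - reserved
    return ",".join(str(cpu) for cpu in selected[:end])


if __name__ == "__main__":
    # One NUMA node with two SMT-8 cores: logical CPUs 0-15.
    node = list(range(16))
    print(binding_list_for_rank(node, num_reserved_cpus=1))
    # -> "0,1,2,3,8,9,10" (threads 0-3 of each core, last CPU reserved)

Keeping only a fixed subset of hardware threads per core avoids oversubscribing SMT siblings, which is where the performance benefit on Power comes from according to the method's docstring.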