[XPU][CI] enhance xpu test support (#20652)
Signed-off-by: Ma, Liangliang <liangliang.ma@intel.com>
Co-authored-by: zhenwei-intel <zhenweiliu@habana.ai>
commit a3e4e85ece
parent eb58f5953d
@@ -759,7 +759,8 @@ class VllmRunner:
     - `trust_remote_code`: Set to `True` instead of `False` for convenience.
     - `seed`: Set to `0` instead of `None` for test reproducibility.
     - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
-    - `block_size`: Set to `16` instead of `None` to reduce memory usage.
+    - `block_size`: To reduce memory usage, set default to `64` if on XPU
+      devices, otherwise default to `16`.
     - `enable_chunked_prefill`: Set to `False` instead of `None` for
       test reproducibility.
     - `enforce_eager`: Set to `False` to test CUDA graph.
@@ -777,7 +778,7 @@ class VllmRunner:
         dtype: str = "auto",
         disable_log_stats: bool = True,
         tensor_parallel_size: int = 1,
-        block_size: int = 16,
+        block_size: int = 16 if not torch.xpu.is_available() else 64,
         enable_chunked_prefill: Optional[bool] = False,
         swap_space: int = 4,
         enforce_eager: Optional[bool] = False,
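The conditional default above is evaluated once, when the function definition is executed, so every test in the session sees the same value. A minimal standalone sketch of the same platform check (the `default_block_size` helper is illustrative, not part of the diff):

import torch

def default_block_size() -> int:
    """Sketch of the test-harness default: 64-token KV-cache blocks when an
    XPU device is present, 16 otherwise. The hasattr guard keeps the sketch
    working on PyTorch builds that ship without the torch.xpu module."""
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return 64
    return 16

print("default block_size:", default_block_size())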
@@ -53,3 +53,6 @@ class XpuCommunicator(DeviceCommunicatorBase):
         else:
             output_tensor = None
         return output_tensor
+
+    def broadcast(self, input_: torch.Tensor, src: int = 0) -> None:
+        dist.broadcast(input_, src=src, group=self.device_group)
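The new `broadcast` method is a thin wrapper over `torch.distributed.broadcast` on the communicator's device group. A self-contained sketch of the same call pattern, using a single-rank `gloo` group purely so it runs anywhere (the real communicator forwards its XPU device group instead):

import torch
import torch.distributed as dist

def main() -> None:
    # One-rank process group so the example needs no launcher; the wrapper in
    # the diff passes self.device_group rather than the default group.
    dist.init_process_group(backend="gloo",
                            init_method="tcp://127.0.0.1:29501",
                            rank=0, world_size=1)
    tensor = torch.arange(4, dtype=torch.float32)
    # In-place broadcast of `tensor` from rank `src` to every rank in the group.
    dist.broadcast(tensor, src=0, group=dist.group.WORLD)
    print(tensor)
    dist.destroy_process_group()

if __name__ == "__main__":
    main()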
@@ -240,6 +240,8 @@ class GroupCoordinator:
 
         if current_platform.is_cuda_alike():
             self.device = torch.device(f"cuda:{local_rank}")
+        elif current_platform.is_xpu():
+            self.device = torch.device(f"xpu:{local_rank}")
         elif current_platform.is_out_of_tree():
             self.device = torch.device(
                 f"{current_platform.device_name}:{local_rank}")
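With the extra branch, XPU ranks bind to `xpu:{local_rank}` instead of whatever the later fallback chooses. The dispatch reduced to a standalone sketch (the availability checks stand in for the `current_platform` predicates; the function name is illustrative):

import torch

def select_rank_device(local_rank: int) -> torch.device:
    # Stand-ins for current_platform.is_cuda_alike() and is_xpu().
    if torch.cuda.is_available():
        return torch.device(f"cuda:{local_rank}")
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return torch.device(f"xpu:{local_rank}")
    # CPU fallback for this sketch; the hunk's chain continues with an
    # out-of-tree platform branch instead.
    return torch.device("cpu")

print(select_rank_device(local_rank=0))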
@@ -91,6 +91,7 @@ class XPUPlatform(Platform):
+
         # FIXME: Temporarily forcing eager mode
         # remove after t.compile support stabilizes.
 
         if (envs.VLLM_USE_V1 and vllm_config.model_config is not None
                 and not vllm_config.model_config.enforce_eager):
             from vllm.config import CompilationLevel
@@ -111,9 +112,6 @@ class XPUPlatform(Platform):
                 "mode.")
             model_config.enforce_eager = True
 
-        if vllm_config.device_config is not None:
-            assert vllm_config.device_config.device_type == "xpu"
-
         # check and update parallel config
         parallel_config = vllm_config.parallel_config
         parallel_config.worker_cls = "vllm.v1.worker.xpu_worker.XPUWorker"
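The two hunks above force eager execution for XPU under V1 unless the user already asked for it, and drop the old `device_config` assertion. The gating pattern, reduced to a standalone sketch (the config class and function are simplified stand-ins, not vLLM's):

from dataclasses import dataclass

@dataclass
class ModelConfigSketch:
    enforce_eager: bool = False

def force_eager_if_needed(model_config: ModelConfigSketch, use_v1: bool) -> None:
    # Mirrors the gate in the hunk: only override when V1 is active, a model
    # config exists, and the user has not already requested eager mode.
    if use_v1 and model_config is not None and not model_config.enforce_eager:
        model_config.enforce_eager = True

cfg = ModelConfigSketch()
force_eager_if_needed(cfg, use_v1=True)
print("enforce_eager:", cfg.enforce_eager)  # True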
@@ -131,8 +129,10 @@ class XPUPlatform(Platform):
             os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
             logger.warning(
                 "Please use spawn as start method if you want to use mp.")
-        elif parallel_config.distributed_executor_backend != "ray" and \
-                parallel_config.distributed_executor_backend != "uni":
+        elif (parallel_config.distributed_executor_backend != "ray"
+              and parallel_config.distributed_executor_backend != "uni"
+              and parallel_config.distributed_executor_backend
+              != "external_launcher"):
             logger.warning(
                 "%s is not supported on XPU, fallback to ray distributed"
                 " executor backend.",
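The rewritten condition adds `external_launcher` to the executor backends that XPU accepts without falling back to Ray (`mp` is handled by the earlier branch that forces the spawn start method). The same check as a compact standalone sketch (the set and function names are illustrative):

import logging

logger = logging.getLogger("xpu_platform_sketch")

# Backends the XPU platform now passes through unchanged.
_PASSTHROUGH_BACKENDS = {"ray", "uni", "external_launcher"}

def resolve_executor_backend(requested: str) -> str:
    """Fall back to ray when the requested backend is unsupported on XPU."""
    if requested in _PASSTHROUGH_BACKENDS:
        return requested
    logger.warning(
        "%s is not supported on XPU, fallback to ray distributed"
        " executor backend.", requested)
    return "ray"

print(resolve_executor_backend("external_launcher"))   # external_launcher
print(resolve_executor_backend("some_other_backend"))  # warns, returns ray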
@@ -27,7 +27,7 @@ class XPUModelRunner(GPUModelRunner):
         self.cascade_attn_enabled = False
 
     def _init_device_properties(self) -> None:
-        pass
+        self.num_sms = None
 
     def _sync_device(self) -> None:
         torch.xpu.synchronize()
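`_init_device_properties` now sets `self.num_sms = None` instead of `pass`, so the attribute exists explicitly even though no SM count is populated for XPU, and `_sync_device` maps to `torch.xpu.synchronize()`. The pair of hooks as a standalone sketch (the class is an illustrative stand-in, with a guard so it also runs on machines without an XPU):

import torch

class XpuRunnerHooksSketch:
    """Illustrative stand-in for the two overridden hooks in the hunk."""

    def _init_device_properties(self) -> None:
        # No SM count is populated for XPU; keep the attribute defined so
        # callers can still read it.
        self.num_sms = None

    def _sync_device(self) -> None:
        # Block until all queued XPU work has finished. Guarded so the sketch
        # also runs on machines without an XPU device.
        if hasattr(torch, "xpu") and torch.xpu.is_available():
            torch.xpu.synchronize()

runner = XpuRunnerHooksSketch()
runner._init_device_properties()
runner._sync_device()
print("num_sms:", runner.num_sms)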