mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-16 11:37:12 +08:00
[XPU] Upgrade NIXL to remove CUDA dependency (#26570)
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
This commit is contained in:
parent
8f8474fbe3
commit
27ed39a347
@ -44,6 +44,5 @@ docker run \
|
|||||||
pytest -v -s v1/structured_output
|
pytest -v -s v1/structured_output
|
||||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
|
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
|
||||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
|
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
|
||||||
pytest -v -s v1/test_metrics
|
|
||||||
pytest -v -s v1/test_serial_utils.py
|
pytest -v -s v1/test_serial_utils.py
|
||||||
'
|
'
|
||||||
|
|||||||
@ -69,4 +69,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
|||||||
|
|
||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||||
|
|
||||||
|
# install nixl from source code
|
||||||
|
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
|
||||||
|
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/"
|
||||||
|
|
||||||
ENTRYPOINT ["vllm", "serve"]
|
ENTRYPOINT ["vllm", "serve"]
|
||||||
|
|||||||
@ -10,7 +10,6 @@ wheel
|
|||||||
jinja2>=3.1.6
|
jinja2>=3.1.6
|
||||||
datasets # for benchmark scripts
|
datasets # for benchmark scripts
|
||||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||||
nixl==0.3.0 # for PD disaggregation
|
|
||||||
torch==2.8.0+xpu
|
torch==2.8.0+xpu
|
||||||
torchaudio
|
torchaudio
|
||||||
torchvision
|
torchvision
|
||||||
|
|||||||
@ -135,6 +135,7 @@ def build_and_install_prerequisites(args):
|
|||||||
"--enable-devel-headers",
|
"--enable-devel-headers",
|
||||||
"--with-verbs",
|
"--with-verbs",
|
||||||
"--enable-mt",
|
"--enable-mt",
|
||||||
|
"--with-ze=no",
|
||||||
]
|
]
|
||||||
run_command(configure_command, cwd=ucx_source_path)
|
run_command(configure_command, cwd=ucx_source_path)
|
||||||
run_command(["make", "-j", str(os.cpu_count() or 1)], cwd=ucx_source_path)
|
run_command(["make", "-j", str(os.cpu_count() or 1)], cwd=ucx_source_path)
|
||||||
|
|||||||
@ -54,6 +54,14 @@ class XPUPlatform(Platform):
|
|||||||
has_sink: bool,
|
has_sink: bool,
|
||||||
use_sparse,
|
use_sparse,
|
||||||
) -> str:
|
) -> str:
|
||||||
|
from vllm.v1.attention.backends.utils import set_kv_cache_layout
|
||||||
|
|
||||||
|
set_kv_cache_layout("NHD")
|
||||||
|
logger.info(
|
||||||
|
"Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
|
||||||
|
"only NHD layout is supported by XPU attention kernels."
|
||||||
|
)
|
||||||
|
|
||||||
from vllm.attention.backends.registry import _Backend
|
from vllm.attention.backends.registry import _Backend
|
||||||
|
|
||||||
if use_sparse:
|
if use_sparse:
|
||||||
@ -190,13 +198,6 @@ class XPUPlatform(Platform):
|
|||||||
vllm_config.scheduler_config.max_model_len,
|
vllm_config.scheduler_config.max_model_len,
|
||||||
DEFAULT_MAX_NUM_BATCHED_TOKENS,
|
DEFAULT_MAX_NUM_BATCHED_TOKENS,
|
||||||
)
|
)
|
||||||
from vllm.v1.attention.backends.utils import set_kv_cache_layout
|
|
||||||
|
|
||||||
set_kv_cache_layout("NHD")
|
|
||||||
logger.info(
|
|
||||||
"Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
|
|
||||||
"only NHD layout is supported by XPU attention kernels."
|
|
||||||
)
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def support_hybrid_kv_cache(cls) -> bool:
|
def support_hybrid_kv_cache(cls) -> bool:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user