mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-21 06:37:04 +08:00
[CI Fix] Pin deepep and pplx tags in tools/ep_kernels/, gate multigpu tests (#23568)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
parent
2a97ffc33d
commit
906e461ed6
@ -390,6 +390,7 @@ steps:
|
|||||||
- csrc/moe/
|
- csrc/moe/
|
||||||
- tests/kernels/moe
|
- tests/kernels/moe
|
||||||
- vllm/model_executor/layers/fused_moe/
|
- vllm/model_executor/layers/fused_moe/
|
||||||
|
- vllm/distributed/device_communicators/
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||||
parallelism: 2
|
parallelism: 2
|
||||||
|
|||||||
@ -18,7 +18,8 @@ from vllm.distributed import (broadcast_tensor_dict, get_pp_group,
|
|||||||
tensor_model_parallel_all_reduce,
|
tensor_model_parallel_all_reduce,
|
||||||
tensor_model_parallel_reduce_scatter)
|
tensor_model_parallel_reduce_scatter)
|
||||||
|
|
||||||
from ..utils import init_test_distributed_environment, multi_process_parallel
|
from ..utils import (init_test_distributed_environment, multi_gpu_test,
|
||||||
|
multi_process_parallel)
|
||||||
|
|
||||||
|
|
||||||
@ray.remote(num_gpus=1, max_calls=1)
|
@ray.remote(num_gpus=1, max_calls=1)
|
||||||
@ -226,8 +227,7 @@ def send_recv_test_worker(
|
|||||||
torch.testing.assert_close(test_tensor, recv_tensor)
|
torch.testing.assert_close(test_tensor, recv_tensor)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
@multi_gpu_test(num_gpus=2)
|
||||||
reason="Need at least 2 GPUs to run the test.")
|
|
||||||
@pytest.mark.parametrize("tp_size", [2])
|
@pytest.mark.parametrize("tp_size", [2])
|
||||||
@pytest.mark.parametrize("test_target", [
|
@pytest.mark.parametrize("test_target", [
|
||||||
all_reduce_test_worker, all_gather_test_worker,
|
all_reduce_test_worker, all_gather_test_worker,
|
||||||
@ -241,8 +241,7 @@ def test_multi_process_tensor_parallel(
|
|||||||
multi_process_parallel(monkeypatch, tp_size, 1, test_target)
|
multi_process_parallel(monkeypatch, tp_size, 1, test_target)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
@multi_gpu_test(num_gpus=2)
|
||||||
reason="Need at least 2 GPUs to run the test.")
|
|
||||||
@pytest.mark.parametrize("pp_size", [2])
|
@pytest.mark.parametrize("pp_size", [2])
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
|
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
|
||||||
@ -254,8 +253,7 @@ def test_multi_process_pipeline_parallel(
|
|||||||
multi_process_parallel(monkeypatch, 1, pp_size, test_target)
|
multi_process_parallel(monkeypatch, 1, pp_size, test_target)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
@multi_gpu_test(num_gpus=4)
|
||||||
reason="Need at least 4 GPUs to run the test.")
|
|
||||||
@pytest.mark.parametrize("tp_size", [2])
|
@pytest.mark.parametrize("tp_size", [2])
|
||||||
@pytest.mark.parametrize("pp_size", [2])
|
@pytest.mark.parametrize("pp_size", [2])
|
||||||
@pytest.mark.parametrize("test_target", [
|
@pytest.mark.parametrize("test_target", [
|
||||||
|
|||||||
@ -23,6 +23,7 @@ from vllm.utils import has_deep_ep, has_deep_gemm
|
|||||||
from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used,
|
from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used,
|
||||||
is_deep_gemm_supported)
|
is_deep_gemm_supported)
|
||||||
|
|
||||||
|
from ...utils import multi_gpu_test
|
||||||
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
||||||
from .utils import make_test_weights
|
from .utils import make_test_weights
|
||||||
|
|
||||||
@ -370,6 +371,7 @@ NUM_EXPERTS = [32]
|
|||||||
@pytest.mark.parametrize("num_experts", NUM_EXPERTS)
|
@pytest.mark.parametrize("num_experts", NUM_EXPERTS)
|
||||||
@pytest.mark.parametrize("topk", TOPKS)
|
@pytest.mark.parametrize("topk", TOPKS)
|
||||||
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
|
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
|
||||||
|
@multi_gpu_test(num_gpus=2)
|
||||||
@requires_deep_ep
|
@requires_deep_ep
|
||||||
@requires_deep_gemm
|
@requires_deep_gemm
|
||||||
@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(),
|
@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(),
|
||||||
@ -427,6 +429,7 @@ USE_FP8_DISPATCH = [False]
|
|||||||
@pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH)
|
@pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH)
|
||||||
@pytest.mark.parametrize("block_size", [[128, 128]])
|
@pytest.mark.parametrize("block_size", [[128, 128]])
|
||||||
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
|
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
|
||||||
|
@multi_gpu_test(num_gpus=2)
|
||||||
@requires_deep_ep
|
@requires_deep_ep
|
||||||
@requires_deep_gemm
|
@requires_deep_gemm
|
||||||
@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(),
|
@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(),
|
||||||
|
|||||||
@ -24,6 +24,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
|||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils import has_deep_ep
|
from vllm.utils import has_deep_ep
|
||||||
|
|
||||||
|
from ...utils import multi_gpu_test
|
||||||
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
||||||
|
|
||||||
if has_deep_ep():
|
if has_deep_ep():
|
||||||
@ -411,6 +412,7 @@ DTYPES = [torch.bfloat16, torch.float8_e4m3fn]
|
|||||||
@pytest.mark.parametrize("topk", [6])
|
@pytest.mark.parametrize("topk", [6])
|
||||||
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
|
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
|
||||||
@pytest.mark.parametrize("per_act_token_quant", [False, True])
|
@pytest.mark.parametrize("per_act_token_quant", [False, True])
|
||||||
|
@multi_gpu_test(num_gpus=2)
|
||||||
@requires_deep_ep
|
@requires_deep_ep
|
||||||
def test_deep_ep_moe(
|
def test_deep_ep_moe(
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
@ -459,6 +461,7 @@ USE_FP8_DISPATCH = [True, False]
|
|||||||
@pytest.mark.parametrize("topk", [6])
|
@pytest.mark.parametrize("topk", [6])
|
||||||
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
|
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
|
||||||
@pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH)
|
@pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH)
|
||||||
|
@multi_gpu_test(num_gpus=2)
|
||||||
@requires_deep_ep
|
@requires_deep_ep
|
||||||
def test_low_latency_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int],
|
def test_low_latency_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int],
|
||||||
num_experts: int, topk: int,
|
num_experts: int, topk: int,
|
||||||
|
|||||||
@ -16,6 +16,7 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
|
|||||||
from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
|
from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
|
||||||
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
|
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
|
||||||
|
|
||||||
|
from ...utils import multi_gpu_test
|
||||||
from .modular_kernel_tools.common import (Config, RankTensors, WeightTensors,
|
from .modular_kernel_tools.common import (Config, RankTensors, WeightTensors,
|
||||||
reference_moe_impl,
|
reference_moe_impl,
|
||||||
run_modular_kernel)
|
run_modular_kernel)
|
||||||
@ -162,6 +163,7 @@ def is_nyi_config(config: Config) -> bool:
|
|||||||
product(MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES))
|
product(MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES))
|
||||||
@pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs)
|
@pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs)
|
||||||
@pytest.mark.parametrize("world_size", [2])
|
@pytest.mark.parametrize("world_size", [2])
|
||||||
|
@multi_gpu_test(num_gpus=2)
|
||||||
@meets_multi_gpu_requirements
|
@meets_multi_gpu_requirements
|
||||||
def test_modular_kernel_combinations_multigpu(
|
def test_modular_kernel_combinations_multigpu(
|
||||||
k: int, n: int, e: int, dtype: torch.dtype,
|
k: int, n: int, e: int, dtype: torch.dtype,
|
||||||
|
|||||||
@ -17,6 +17,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
|||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils import cdiv
|
from vllm.utils import cdiv
|
||||||
|
|
||||||
|
from ...utils import multi_gpu_test
|
||||||
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -247,6 +248,7 @@ def _pplx_moe(
|
|||||||
@pytest.mark.parametrize("per_out_ch", [True, False])
|
@pytest.mark.parametrize("per_out_ch", [True, False])
|
||||||
@pytest.mark.parametrize("world_dp_size", [[2, 1]]) #, [4, 2]])
|
@pytest.mark.parametrize("world_dp_size", [[2, 1]]) #, [4, 2]])
|
||||||
@pytest.mark.parametrize("use_internode", [False])
|
@pytest.mark.parametrize("use_internode", [False])
|
||||||
|
@multi_gpu_test(num_gpus=2)
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
(lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
|
(lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
|
||||||
current_platform.get_device_capability()),
|
current_platform.get_device_capability()),
|
||||||
|
|||||||
@ -37,6 +37,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
|
|||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils import round_up
|
from vllm.utils import round_up
|
||||||
|
|
||||||
|
from ...utils import multi_gpu_test
|
||||||
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
||||||
|
|
||||||
requires_pplx = pytest.mark.skipif(
|
requires_pplx = pytest.mark.skipif(
|
||||||
@ -452,6 +453,7 @@ def _pplx_prepare_finalize(
|
|||||||
@pytest.mark.parametrize("use_internode", [False])
|
@pytest.mark.parametrize("use_internode", [False])
|
||||||
@pytest.mark.optional
|
@pytest.mark.optional
|
||||||
@requires_pplx
|
@requires_pplx
|
||||||
|
@multi_gpu_test(num_gpus=2)
|
||||||
def test_pplx_prepare_finalize_slow(
|
def test_pplx_prepare_finalize_slow(
|
||||||
mnk: tuple[int, int, int],
|
mnk: tuple[int, int, int],
|
||||||
e: int,
|
e: int,
|
||||||
@ -740,6 +742,7 @@ def _pplx_moe(
|
|||||||
@pytest.mark.parametrize("use_internode", [False])
|
@pytest.mark.parametrize("use_internode", [False])
|
||||||
@pytest.mark.optional
|
@pytest.mark.optional
|
||||||
@requires_pplx
|
@requires_pplx
|
||||||
|
@multi_gpu_test(num_gpus=2)
|
||||||
def test_pplx_moe_slow(
|
def test_pplx_moe_slow(
|
||||||
mnk: tuple[int, int, int],
|
mnk: tuple[int, int, int],
|
||||||
e: int,
|
e: int,
|
||||||
@ -880,6 +883,7 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
|
|||||||
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
||||||
@pytest.mark.parametrize("use_internode", [False])
|
@pytest.mark.parametrize("use_internode", [False])
|
||||||
@requires_pplx
|
@requires_pplx
|
||||||
|
@multi_gpu_test(num_gpus=2)
|
||||||
def test_pplx_prepare_finalize(
|
def test_pplx_prepare_finalize(
|
||||||
world_dp_size: tuple[int, int],
|
world_dp_size: tuple[int, int],
|
||||||
use_internode: bool,
|
use_internode: bool,
|
||||||
@ -893,6 +897,7 @@ def test_pplx_prepare_finalize(
|
|||||||
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
||||||
@pytest.mark.parametrize("use_internode", [False])
|
@pytest.mark.parametrize("use_internode", [False])
|
||||||
@requires_pplx
|
@requires_pplx
|
||||||
|
@multi_gpu_test(num_gpus=2)
|
||||||
def test_pplx_moe(
|
def test_pplx_moe(
|
||||||
world_dp_size: tuple[int, int],
|
world_dp_size: tuple[int, int],
|
||||||
use_internode: bool,
|
use_internode: bool,
|
||||||
|
|||||||
@ -696,9 +696,12 @@ def multi_process_parallel(
|
|||||||
os.environ["RAY_RUNTIME_ENV_IGNORE_GITIGNORE"] = "1"
|
os.environ["RAY_RUNTIME_ENV_IGNORE_GITIGNORE"] = "1"
|
||||||
ray.init(
|
ray.init(
|
||||||
runtime_env={
|
runtime_env={
|
||||||
"working_dir": VLLM_PATH,
|
"working_dir":
|
||||||
"excludes":
|
VLLM_PATH,
|
||||||
["build", ".git", "cmake-build-*", "shellcheck", "dist"]
|
"excludes": [
|
||||||
|
"build", ".git", "cmake-build-*", "shellcheck", "dist",
|
||||||
|
"ep_kernels_workspace"
|
||||||
|
]
|
||||||
})
|
})
|
||||||
|
|
||||||
distributed_init_port = get_open_port()
|
distributed_init_port = get_open_port()
|
||||||
|
|||||||
@ -77,6 +77,7 @@ clone_repo() {
|
|||||||
local repo_url=$1
|
local repo_url=$1
|
||||||
local dir_name=$2
|
local dir_name=$2
|
||||||
local key_file=$3
|
local key_file=$3
|
||||||
|
local commit_hash=$4
|
||||||
|
|
||||||
if [ -d "$dir_name" ]; then
|
if [ -d "$dir_name" ]; then
|
||||||
# Check if directory has uncommitted changes (dirty)
|
# Check if directory has uncommitted changes (dirty)
|
||||||
@ -87,17 +88,27 @@ clone_repo() {
|
|||||||
echo "$dir_name directory exists but clone appears incomplete, cleaning up and re-cloning"
|
echo "$dir_name directory exists but clone appears incomplete, cleaning up and re-cloning"
|
||||||
rm -rf "$dir_name"
|
rm -rf "$dir_name"
|
||||||
git clone "$repo_url"
|
git clone "$repo_url"
|
||||||
|
if [ -n "$commit_hash" ]; then
|
||||||
|
cd "$dir_name"
|
||||||
|
git checkout "$commit_hash"
|
||||||
|
cd ..
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
echo "$dir_name directory exists and appears complete; manually update if needed"
|
echo "$dir_name directory exists and appears complete; manually update if needed"
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
git clone "$repo_url"
|
git clone "$repo_url"
|
||||||
|
if [ -n "$commit_hash" ]; then
|
||||||
|
cd "$dir_name"
|
||||||
|
git checkout "$commit_hash"
|
||||||
|
cd ..
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# build and install pplx, require pytorch installed
|
# build and install pplx, require pytorch installed
|
||||||
pushd $WORKSPACE
|
pushd $WORKSPACE
|
||||||
clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py"
|
clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" "c336faf"
|
||||||
cd pplx-kernels
|
cd pplx-kernels
|
||||||
# see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
|
# see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
|
||||||
# PIP_NO_BUILD_ISOLATION=0 disables build isolation
|
# PIP_NO_BUILD_ISOLATION=0 disables build isolation
|
||||||
@ -106,7 +117,7 @@ popd
|
|||||||
|
|
||||||
# build and install deepep, require pytorch installed
|
# build and install deepep, require pytorch installed
|
||||||
pushd $WORKSPACE
|
pushd $WORKSPACE
|
||||||
clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py"
|
clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" "e3908bf"
|
||||||
cd DeepEP
|
cd DeepEP
|
||||||
export NVSHMEM_DIR=$WORKSPACE/nvshmem_install
|
export NVSHMEM_DIR=$WORKSPACE/nvshmem_install
|
||||||
PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e .
|
PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e .
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user