mirror of https://git.datalinker.icu/vllm-project/vllm.git
[Build/CI] Extending the set of AMD tests with Regression, Basic Correctness, Distributed, Engine, Llava Tests (#4797)
This commit is contained in:
parent 0150a10630
commit 26148120b3
@@ -1,4 +1,4 @@
-# This script build the ROCm docker image and runs test inside it.
+# This script runs test inside the corresponding ROCm docker container.
 set -ex

 # Print ROCm version
@@ -19,15 +19,16 @@ done

 echo "--- Building container"
 sha=$(git rev-parse --short HEAD)
-container_name=rocm_${sha}
+image_name=rocm_${sha}
+container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
 docker build \
-       -t ${container_name} \
+       -t ${image_name} \
        -f Dockerfile.rocm \
        --progress plain \
        .

 remove_docker_container() {
-   docker rm -f ${container_name} || docker image rm -f ${container_name} || true
+   docker rm -f ${container_name} || docker image rm -f ${image_name} || true
 }
 trap remove_docker_container EXIT

@@ -39,6 +40,6 @@ docker run \
        --rm \
        -e HF_TOKEN \
        --name ${container_name} \
-       ${container_name} \
+       ${image_name} \
        /bin/bash -c "${@}"

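The script changes above separate the image tag from the container name: the image keeps the deterministic per-commit rocm_${sha} tag, while each run now gets a container name with a random 10-character suffix, so concurrent jobs on the same host no longer collide, and the EXIT trap removes the run's container before falling back to removing the image. A minimal Python sketch of the same unique-naming idea (the make_container_name helper and the use of the secrets module are illustrative, not part of the script):

import secrets
import string


def make_container_name(sha: str, suffix_len: int = 10) -> str:
    """Mimic the script's rocm_${sha}_<random 10 chars> naming scheme."""
    alphabet = string.ascii_letters + string.digits
    suffix = "".join(secrets.choice(alphabet) for _ in range(suffix_len))
    return f"rocm_{sha}_{suffix}"


image_name = "rocm_abc1234"            # deterministic: one image tag per commit sha
container_name = make_container_name("abc1234")
print(image_name, container_name)      # the container name differs on every invocation
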
@@ -5,13 +5,16 @@

 steps:
 - label: Regression Test
+  mirror_hardwares: [amd]
   command: pytest -v -s test_regression.py
   working_dir: "/vllm-workspace/tests" # optional

 - label: AsyncEngine Test
+  #mirror_hardwares: [amd]
   command: pytest -v -s async_engine

 - label: Basic Correctness Test
+  mirror_hardwares: [amd]
   commands:
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
@@ -24,14 +27,15 @@ steps:
   command: pytest -v -s core

 - label: Distributed Comm Ops Test
+  #mirror_hardwares: [amd]
   command: pytest -v -s distributed/test_comm_ops.py
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2

 - label: Distributed Tests
+  mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
-  mirror_hardwares: [amd]
   commands:
   - pytest -v -s distributed/test_pynccl_library.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
@@ -45,16 +49,18 @@ steps:
   - pytest -v -s spec_decode/e2e/test_integration_dist.py

 - label: Distributed Tests (Multiple Groups)
+  #mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   commands:
   - pytest -v -s distributed/test_pynccl.py

 - label: Engine Test
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
   command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py

 - label: Entrypoints Test
+  #mirror_hardwares: [amd]
   commands:
   # these tests have to be separated, because each one will allocate all posible GPU memory
   - pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py
@@ -74,6 +80,7 @@ steps:
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors

 - label: Kernels Test %N
+  #mirror_hardwares: [amd]
   command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 4

@@ -84,7 +91,7 @@ steps:
   - pytest -v -s models --ignore=models/test_llava.py

 - label: Llava Test
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
   commands:
   - bash ../.buildkite/download-images.sh
   - pytest -v -s models/test_llava.py
@@ -95,6 +102,7 @@ steps:
   - pytest -v -s prefix_caching

 - label: Samplers Test
+  #mirror_hardwares: [amd]
   command: pytest -v -s samplers

 - label: LogitsProcessor Test
@@ -110,16 +118,20 @@ steps:
   command: pytest -v -s spec_decode

 - label: LoRA Test %N
+  #mirror_hardwares: [amd]
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 4

 - label: Tensorizer Test
+  #mirror_hardwares: [amd]
   command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader

 - label: Metrics Test
+  mirror_hardwares: [amd]
   command: pytest -v -s metrics

 - label: Quantization Test
+  #mirror_hardwares: [amd]
   command: pytest -v -s quantization

 - label: Benchmarks

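In the pipeline above, a step tagged mirror_hardwares: [amd] is also scheduled on the AMD (ROCm) queue, while the commented-out #mirror_hardwares: [amd] entries mark suites that are not yet enabled there; this commit turns the tag on for the Regression, Basic Correctness, Distributed, Engine, Llava and Metrics suites. The mirroring itself happens in the Buildkite template, which is not shown in full here. As a rough illustration only, the sketch below lists which steps a pipeline file like this would mirror to AMD (PyYAML and the file path are assumptions of the example):

import yaml  # PyYAML, assumed to be installed

# Load the Buildkite pipeline definition and report the AMD-mirrored steps.
with open(".buildkite/test-pipeline.yaml") as f:  # path assumed for the example
    pipeline = yaml.safe_load(f)

amd_steps = [
    step["label"]
    for step in pipeline.get("steps", [])
    if "amd" in step.get("mirror_hardwares", [])
]
print("Steps mirrored to AMD:", amd_steps)
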
@@ -3,9 +3,8 @@
 {% set default_working_dir = "/vllm-workspace/tests" %}

 steps:
-
   - label: ":docker: build image"
     commands:
       - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
       - "docker push {{ docker_image }}"
     env:

@@ -32,6 +32,7 @@ def test_stop_reason(vllm_model, example_prompts):
     # test stop token
     outputs = llm.generate(example_prompts,
                            sampling_params=SamplingParams(
+                               ignore_eos=True,
                                seed=SEED,
                                max_tokens=MAX_TOKENS,
                                stop_token_ids=[stop_token_id]))
@@ -43,7 +44,10 @@ def test_stop_reason(vllm_model, example_prompts):
     # test stop string
     outputs = llm.generate(example_prompts,
                            sampling_params=SamplingParams(
-                               seed=SEED, max_tokens=MAX_TOKENS, stop="."))
+                               ignore_eos=True,
+                               seed=SEED,
+                               max_tokens=MAX_TOKENS,
+                               stop="."))
     for output in outputs:
         output = output.outputs[0]
         assert output.finish_reason == "stop"

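Both generate calls now pass ignore_eos=True, so decoding cannot end early at the model's EOS token and the assertions really exercise the explicit stop_token_ids / stop conditions; the second call is also reflowed to one keyword argument per line. A minimal sketch of the same pattern with vLLM's public API (the model name, constants and stop token id are placeholders, not the test's actual fixtures):

from vllm import LLM, SamplingParams

# Placeholder values standing in for the test's fixtures and constants.
llm = LLM(model="facebook/opt-125m")
SEED, MAX_TOKENS = 42, 128
stop_token_id = 13  # hypothetical token id to stop on

params = SamplingParams(
    ignore_eos=True,          # keep sampling past EOS so only the stop condition ends it
    seed=SEED,
    max_tokens=MAX_TOKENS,
    stop_token_ids=[stop_token_id],
)
outputs = llm.generate(["Hello, my name is"], sampling_params=params)
print(outputs[0].outputs[0].finish_reason)  # "stop" when the stop token was generated
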
@@ -1060,7 +1060,7 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
     "bfloat16": torch.bfloat16,
 }

-_ROCM_NOT_SUPPORTED_DTYPE = ["float", "float32"]
+_ROCM_NOT_SUPPORTED_DTYPE: List[str] = []  #


 def _get_and_verify_dtype(
@@ -1092,14 +1092,6 @@ def _get_and_verify_dtype(
     else:
         raise ValueError(f"Unknown dtype: {dtype}")

-    if is_hip() and torch_dtype == torch.float32:
-        rocm_supported_dtypes = [
-            k for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items()
-            if (k not in _ROCM_NOT_SUPPORTED_DTYPE)
-        ]
-        raise ValueError(f"dtype '{dtype}' is not supported in ROCm. "
-                         f"Supported dtypes are {rocm_supported_dtypes}")
-
     # Verify the dtype.
     if torch_dtype != config_dtype:
         if torch_dtype == torch.float32:

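With _ROCM_NOT_SUPPORTED_DTYPE emptied and the is_hip()/float32 guard deleted, ROCm builds no longer reject float32 models up front; they go through the same generic dtype verification as other platforms. A condensed sketch of the resolution step that remains (simplified for illustration, not the full _get_and_verify_dtype implementation):

from typing import List, Union

import torch

_STR_DTYPE_TO_TORCH_DTYPE = {
    "half": torch.float16,
    "float16": torch.float16,
    "float": torch.float32,
    "float32": torch.float32,
    "bfloat16": torch.bfloat16,
}

# After this commit the ROCm blocklist is empty: no dtype is rejected outright.
_ROCM_NOT_SUPPORTED_DTYPE: List[str] = []


def resolve_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
    """Map a user-supplied dtype string (or torch.dtype) to a torch.dtype."""
    if isinstance(dtype, str):
        key = dtype.lower()
        if key not in _STR_DTYPE_TO_TORCH_DTYPE:
            raise ValueError(f"Unknown dtype: {dtype}")
        return _STR_DTYPE_TO_TORCH_DTYPE[key]
    if isinstance(dtype, torch.dtype):
        return dtype
    raise ValueError(f"Unknown dtype: {dtype}")


print(resolve_dtype("float32"))  # now accepted on ROCm builds as well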