vllm/tests/basic_correctness/test_cpu_offload.py

from vllm.utils import is_hip

from ..utils import compare_two_settings


def test_cpu_offload():
    compare_two_settings("meta-llama/Llama-2-7b-hf", [],
                         ["--cpu-offload-gb", "4"])
    if not is_hip():
        # compressed-tensors quantization is currently not supported in ROCm.
        compare_two_settings(
            "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t", [],
            ["--cpu-offload-gb", "1"])