vllm/benchmarks/benchmark_hash.py
Lumis Chen 9bcf92295a
[Core] Add xxHash as a high-performance hash option for accelerating prefix caching (#29163)
Signed-off-by: LuminolT <lumischen01@gmail.com>
Signed-off-by: Lumis Chen <lumischen01@gmail.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2025-12-03 16:06:57 +00:00

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Micro benchmark comparing built-in hash(), SHA-256, and xxHash.
This focuses on a single test payload shaped like the prefix-cache hash input:
(32-byte bytes object, 32-int tuple)
Usage:
python benchmarks/hash_micro_benchmark.py --iterations 20000
"""

from __future__ import annotations

import argparse
import random
import statistics
import time
from collections.abc import Callable, Iterable

from vllm.utils.hashing import sha256, xxhash
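
# NOTE: sha256() and xxhash() are the content-based hashers from
# vllm.utils.hashing. Judging by the "(pickle)" labels used below, both are
# assumed to serialize the payload (e.g. via pickle) before hashing, so their
# timings include serialization cost; built-in hash() hashes the tuple directly.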


def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
    """Generate a deterministic test payload."""
    random.seed(seed)
    bytes_data = bytes(random.getrandbits(8) for _ in range(32))
    int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32))
    return (bytes_data, int_tuple)


def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
    """Return (avg_seconds, std_seconds) for hashing `data` `iterations` times."""
    times: list[float] = []
    # Warm-up to avoid first-run noise.
    for _ in range(200):
        func(data)
    for _ in range(iterations):
        start = time.perf_counter()
        func(data)
        end = time.perf_counter()
        times.append(end - start)
    avg = statistics.mean(times)
    std = statistics.stdev(times) if len(times) > 1 else 0.0
    return avg, std
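
# Example (hypothetical numbers): time sha256 on the default payload.
#   avg, std = _benchmark_func(sha256, _generate_test_data(42), 1_000)
#   print(f"{avg * 1e6:.2f} ± {std * 1e6:.2f} μs")
# Per-call timing keeps the spread across iterations visible, at the cost of
# adding perf_counter() call overhead (typically well under a microsecond) to
# every sample.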


def _run_benchmarks(
    benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
    data: tuple,
    iterations: int,
):
    """Yield (name, avg, std) for each benchmark, skipping unavailable ones."""
    for name, func in benchmarks:
        try:
            avg, std = _benchmark_func(func, data, iterations)
        except ModuleNotFoundError as exc:
            print(f"Skipping {name}: {exc}")
            continue
        yield name, avg, std
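
# NOTE: ModuleNotFoundError is caught so the script still runs when an optional
# hashing backend (presumably the `xxhash` package) is not installed; the
# corresponding row is simply skipped.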


def builtin_hash(data: tuple) -> int:
    """Wrapper for Python's built-in hash()."""
    return hash(data)
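
# Built-in hash() serves only as a speed baseline: for str/bytes it is
# randomized per process (PYTHONHASHSEED), so it cannot be reused as a stable,
# content-based key the way SHA-256 or xxHash digests can.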


def main() -> None:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--iterations",
        type=int,
        default=10_000,
        help="Number of measured iterations per hash function.",
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed for test payload."
    )
    args = parser.parse_args()

    data = _generate_test_data(args.seed)
    benchmarks = (
        ("SHA256 (pickle)", sha256),
        ("xxHash (pickle)", xxhash),
        ("built-in hash()", builtin_hash),
    )

    print("=" * 60)
    print("HASH FUNCTION MICRO BENCHMARK")
    print("=" * 60)
    print("Test data: (32-byte bytes object, 32-int tuple)")
    print(f"Iterations: {args.iterations:,}")
    print("=" * 60)

    results = list(_run_benchmarks(benchmarks, data, args.iterations))
    builtin_entry = next((r for r in results if r[0] == "built-in hash()"), None)

    print("\nResults:")
    for name, avg, std in results:
        print(f" {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")

    if builtin_entry:
        _, builtin_avg, _ = builtin_entry
        print("\n" + "=" * 60)
        print("SUMMARY (relative to built-in hash())")
        print("=" * 60)
        for name, avg, _ in results:
            if name == "built-in hash()":
                continue
            speed_ratio = avg / builtin_avg
            print(f"{name} is {speed_ratio:.1f}x slower than built-in hash()")
    else:
        print("\nBuilt-in hash() result missing; cannot compute speed ratios.")


if __name__ == "__main__":
    main()
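
# Example output shape (values are placeholders, not measurements):
#
#   ============================================================
#   HASH FUNCTION MICRO BENCHMARK
#   ============================================================
#   Test data: (32-byte bytes object, 32-int tuple)
#   Iterations: 10,000
#   ============================================================
#
#   Results:
#    SHA256 (pickle) :     X.XX ±   X.XX μs
#    xxHash (pickle) :     X.XX ±   X.XX μs
#    built-in hash() :     X.XX ±   X.XX μs
#
#   ============================================================
#   SUMMARY (relative to built-in hash())
#   ============================================================
#   SHA256 (pickle) is X.Xx slower than built-in hash()
#   xxHash (pickle) is X.Xx slower than built-in hash()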