diff --git a/benchmarks/benchmark_batch_invariance.py b/benchmarks/benchmark_batch_invariance.py index b5c16c42de467..7473a41e51406 100755 --- a/benchmarks/benchmark_batch_invariance.py +++ b/benchmarks/benchmark_batch_invariance.py @@ -104,7 +104,6 @@ def run_benchmark_with_batch_invariant( random.seed(seed) # Set environment variables - os.environ["VLLM_ATTENTION_BACKEND"] = backend if batch_invariant: os.environ["VLLM_BATCH_INVARIANT"] = "1" else: @@ -140,6 +139,7 @@ def run_benchmark_with_batch_invariant( max_model_len=max_model_len, dtype="bfloat16", tensor_parallel_size=tp_size, + attention_config={"backend": backend}, enable_prefix_caching=False, ) init_time = time.perf_counter() - start_init diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py index 28ab2cee71a6a..f8a629ed46cee 100644 --- a/tests/compile/distributed/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -557,7 +557,8 @@ def test_rms_group_quant( # To capture subprocess logs, we need to know whether spawn or fork is used. # Force spawn as it is more general. monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name) + + model_kwargs["attention_config"] = {"backend": backend.name} compilation_config = CompilationConfig( # Testing properties