diff --git a/tests/v1/generation/test_batch_invariance.py b/tests/v1/generation/test_batch_invariance.py index db1c757521f00..31f6f377da624 100644 --- a/tests/v1/generation/test_batch_invariance.py +++ b/tests/v1/generation/test_batch_invariance.py @@ -292,8 +292,11 @@ def LLM_with_max_seqs( # Allow some CPU offload if needed. swap_space=swap_space, # Keep things lean and CI-friendly. - dtype="float16", + dtype="auto", # Single-GPU by default; override externally if desired. tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), trust_remote_code=os.getenv("VLLM_TRUST_REMOTE_CODE", "0") == "1", + enable_prefix_caching=False, + # Uncomment to enable expert parallelism for MoE models: + # enable_expert_parallel=True, )