Attempt to fix GPU OOM in a spec-decoding test (#29419)

Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com>
2025-12-09 04:45:01 +08:00 · 2025-11-25 20:23:36 +01:00 · 2025-11-25 20:23:36 +01:00 · c32a18cbe7
commit c32a18cbe7
parent b07555d26f
1 changed file with 1 addition and 1 deletion
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -133,7 +133,7 @@ def main(args):
        tensor_parallel_size=args.tp,
        enable_chunked_prefill=args.enable_chunked_prefill,
        enforce_eager=args.enforce_eager,
-        gpu_memory_utilization=0.8,
+        gpu_memory_utilization=0.9,
        speculative_config=speculative_config,
        disable_log_stats=False,
        max_model_len=args.max_model_len,