Fix CI issue in distributed 4-GPU test (#20204)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Author: Wentao Ye
Date: 2025-06-28 01:50:00 -04:00 (committed by GitHub)
parent a29e62ea34
commit d45417b804


@@ -64,6 +64,18 @@ def parse_args():
     parser.add_argument(
         "--trust-remote-code", action="store_true", help="Trust remote code."
     )
+    parser.add_argument(
+        "--max-num-seqs",
+        type=int,
+        default=64,
+        help=("Maximum number of sequences to be processed in a single iteration."),
+    )
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.8,
+        help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
+    )
     return parser.parse_args()
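For context, the two new flags behave like ordinary argparse options: when omitted they fall back to their defaults. A minimal, self-contained sketch below (a hypothetical parser reduced to just these two options, not the example script itself) shows that behavior.

import argparse

# Hypothetical stand-in for the example script's parser, reduced to the
# two flags added by this commit.
parser = argparse.ArgumentParser()
parser.add_argument("--max-num-seqs", type=int, default=64)
parser.add_argument("--gpu-memory-utilization", type=float, default=0.8)

# Only --gpu-memory-utilization is overridden; --max-num-seqs keeps its default.
args = parser.parse_args(["--gpu-memory-utilization", "0.7"])
print(args.max_num_seqs)            # 64
print(args.gpu_memory_utilization)  # 0.7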
@@ -77,6 +89,8 @@ def main(
     GPUs_per_dp_rank,
     enforce_eager,
     trust_remote_code,
+    max_num_seqs,
+    gpu_memory_utilization,
 ):
     os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
     os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
@@ -127,6 +141,8 @@ def main(
         enforce_eager=enforce_eager,
         enable_expert_parallel=True,
         trust_remote_code=trust_remote_code,
+        max_num_seqs=max_num_seqs,
+        gpu_memory_utilization=gpu_memory_utilization,
     )
     outputs = llm.generate(prompts, sampling_params)
     # Print the outputs.
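Both values land on the LLM constructor, where max_num_seqs caps how many sequences are batched per engine step and gpu_memory_utilization bounds the fraction of VRAM vLLM pre-allocates. A minimal standalone sketch of an equivalent call follows; the model name and prompt are illustrative, and the example's data-parallel env setup and enable_expert_parallel are omitted.

from vllm import LLM, SamplingParams

# Standalone sketch: the same two knobs the commit threads through.
llm = LLM(
    model="facebook/opt-125m",   # illustrative model, not from the commit
    enforce_eager=True,
    max_num_seqs=64,             # cap sequences processed per iteration
    gpu_memory_utilization=0.8,  # leave VRAM headroom on the shared CI GPUs
)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)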
@@ -181,6 +197,8 @@ if __name__ == "__main__":
                 tp_size,
                 args.enforce_eager,
                 args.trust_remote_code,
+                args.max_num_seqs,
+                args.gpu_memory_utilization,
             ),
         )
         proc.start()
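Finally, the args tuple handed to Process must stay positionally aligned with main()'s signature, which is why both new values are appended in the same order in both places. A toy sketch of that pattern (a hypothetical worker, not the commit's main()):

from multiprocessing import Process

def worker(enforce_eager, trust_remote_code, max_num_seqs, gpu_memory_utilization):
    # Positional arguments arrive in the exact order of the args tuple below.
    print(f"max_num_seqs={max_num_seqs}, gpu_mem_util={gpu_memory_utilization}")

if __name__ == "__main__":
    proc = Process(target=worker, args=(True, False, 64, 0.8))
    proc.start()
    proc.join()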