From a65a934ebeeddd8ecfc8034d784f3e284a365971 Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Sun, 9 Nov 2025 13:08:38 -0800 Subject: [PATCH] [CI/Build] Temporary fix to LM Eval Small Models (#28324) Signed-off-by: zhewenli --- .buildkite/test-pipeline.yaml | 2 +- tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml | 5 ++++- tests/evals/gsm8k/test_gsm8k_correctness.py | 4 +++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a4436bc2ac222..b81c090fa4710 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1253,7 +1253,7 @@ steps: - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - - pytest -v -s tests/v1/distributed/test_dbo.py + - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### - label: Distributed Tests (B200) # optional diff --git a/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml b/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml index c5dbceeeb2b45..ea9c95158405a 100644 --- a/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml +++ b/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml @@ -2,4 +2,7 @@ model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16" accuracy_threshold: 0.45 num_questions: 1319 num_fewshot: 5 -max_model_len: 4096 \ No newline at end of file +max_model_len: 4096 +# Duo stream incompatible with this model: https://github.com/vllm-project/vllm/issues/28220 +env: +  VLLM_DISABLE_SHARED_EXPERTS_STREAM: "1" diff --git a/tests/evals/gsm8k/test_gsm8k_correctness.py b/tests/evals/gsm8k/test_gsm8k_correctness.py index ce3ab8096b45c..b5d67df7bf3db 100644 --- 
a/tests/evals/gsm8k/test_gsm8k_correctness.py +++ b/tests/evals/gsm8k/test_gsm8k_correctness.py @@ -62,9 +62,11 @@ def test_gsm8k_correctness_param(config_filename, tp_size): str(tp_size), ] + env_dict = eval_config.get("env", None) + # Launch server and run evaluation with RemoteOpenAIServer( - eval_config["model_name"], server_args, max_wait_seconds=480 + eval_config["model_name"], server_args, env_dict=env_dict, max_wait_seconds=480 ) as remote_server: server_url = remote_server.url_for("v1")