diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 771756a42f402..d11a43377548c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -835,11 +835,11 @@ steps:
     - pytest -v -s tests/kernels/moe/test_flashinfer.py
     - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
 
-- label: GPT-OSS Eval (Blackwell)
+- label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
   gpu: b200
-  optional: true # disable while debugging
+  optional: true # run on nightlies
   source_file_dependencies:
   - tests/evals/gpt_oss
   - vllm/model_executor/models/gpt_oss.py
@@ -866,6 +866,16 @@ steps:
   commands:
     - pytest -s -v tests/quantization/test_blackwell_moe.py
 
+- label: Blackwell LM Eval Small Models # GSM8K correctness eval (tp-size 1) on gpu b200
+  timeout_in_minutes: 75
+  gpu: b200
+  optional: true # run on nightlies
+  source_file_dependencies: # NOTE(review): tests/evals/gsm8k (home of models-blackwell.txt) is not listed - confirm
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands: # NOTE(review): no working_dir on this step; the relative paths below rely on the default - confirm
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+
 #####  1 GPU test  #####
 #####  multi gpus test  #####
 
diff --git a/tests/evals/gsm8k/configs/models-blackwell.txt b/tests/evals/gsm8k/configs/models-blackwell.txt
new file mode 100644
index 0000000000000..e577645d60d6f
--- /dev/null
+++ b/tests/evals/gsm8k/configs/models-blackwell.txt
@@ -0,0 +1,4 @@
+Qwen3-0.6B-FP8.yaml
+Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+Qwen1.5-MoE-W4A16-CT.yaml
+DeepSeek-V2-Lite-Instruct-FP8.yaml