diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 5309581d8e81..71249a9543c7 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -550,6 +550,26 @@ steps:
   commands:
     - pytest -v -s kernels/mamba
 
+- label: Kernels DeepGEMM Test (H100)
+  timeout_in_minutes: 45
+  gpu: h100
+  num_gpus: 1
+  optional: true
+  source_file_dependencies:
+  - tools/install_deepgemm.sh
+  - vllm/utils/deep_gemm.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization/test_block_fp8.py
+  - tests/kernels/moe/test_deepgemm.py
+  - tests/kernels/moe/test_batched_deepgemm.py
+  - tests/kernels/attention/test_deepgemm_attention.py
+  commands:
+    - pytest -v -s tests/kernels/quantization/test_block_fp8.py -k deep_gemm
+    - pytest -v -s tests/kernels/moe/test_deepgemm.py
+    - pytest -v -s tests/kernels/moe/test_batched_deepgemm.py
+    - pytest -v -s tests/kernels/attention/test_deepgemm_attention.py
+
 - label: Model Executor Test # 23min
   timeout_in_minutes: 35
   torch_nightly: true
diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py
index e9973c1fcc15..d0e4f6554a91 100644
--- a/tests/kernels/quantization/test_block_fp8.py
+++ b/tests/kernels/quantization/test_block_fp8.py
@@ -22,6 +22,7 @@ from vllm.utils.deep_gemm import (
     fp8_gemm_nt,
     get_col_major_tma_aligned_tensor,
     per_block_cast_to_fp8,
+    should_use_deepgemm_for_fp8_linear,
 )
 from vllm.utils.import_utils import has_deep_gemm
 
@@ -157,10 +158,6 @@ def test_w8a8_block_fp8_cutlass_matmul():
 @pytest.mark.skipif(not has_deep_gemm(), reason="DeepGemm kernels not available.")
 @torch.inference_mode()
 def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
-    # only aligned sizes
-    if M % 4 != 0 or K % 128 != 0 or N % 64 != 0:
-        pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}")
-
     torch.manual_seed(seed)
     fp8_info = torch.finfo(torch.float8_e4m3fn)
     fp8_max = fp8_info.max
@@ -168,6 +165,12 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
     A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
     B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
 
+    # only aligned sizes are supported by deepgemm
+    if not should_use_deepgemm_for_fp8_linear(
+        output_dtype=out_dtype, weight=B_fp32, supports_deep_gemm=True
+    ):
+        pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}")
+
     A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_size[1])
     B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32, block_size=block_size)
 
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
index b5ab37534dd7..6b0a383a0e28 100644
--- a/vllm/utils/deep_gemm.py
+++ b/vllm/utils/deep_gemm.py
@@ -365,11 +365,18 @@ def should_use_deepgemm_for_fp8_linear(
 ):
     if supports_deep_gemm is None:
         supports_deep_gemm = is_deep_gemm_supported()
+
+    # Verify DeepGEMM N/K dims requirements
+    # NOTE: Also synchronized with test_w8a8_block_fp8_deep_gemm_matmul
+    # test inside kernels/quantization/test_block_fp8.py
+    N_MULTIPLE = 64
+    K_MULTIPLE = 128
+
     return (
         supports_deep_gemm
         and output_dtype == torch.bfloat16
-        and weight.shape[0] % 128 == 0
-        and weight.shape[1] % 128 == 0
+        and weight.shape[0] % N_MULTIPLE == 0
+        and weight.shape[1] % K_MULTIPLE == 0
     )
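
For context, a minimal sketch of the gate behavior this patch settles on, runnable without a GPU or a vLLM checkout. The helper `deepgemm_gate` and the example shapes are illustrative assumptions that mirror the conditions in the patched `should_use_deepgemm_for_fp8_linear`; they are not the vLLM implementation itself. With `supports_deep_gemm` forced to `True` (as the updated test does), the decision reduces to a bfloat16 output dtype plus N % 64 == 0 and K % 128 == 0 on the weight:

```python
# Illustrative only: mirrors the gating logic from the diff above so the
# alignment rule can be exercised without DeepGEMM installed.
import torch


def deepgemm_gate(
    output_dtype: torch.dtype,
    weight: torch.Tensor,
    supports_deep_gemm: bool = True,
) -> bool:
    # Same conditions as the patched should_use_deepgemm_for_fp8_linear():
    # N (weight rows) must be a multiple of 64, K (weight cols) a multiple
    # of 128, and the output dtype must be bfloat16.
    n_multiple, k_multiple = 64, 128
    return (
        supports_deep_gemm
        and output_dtype == torch.bfloat16
        and weight.shape[0] % n_multiple == 0
        and weight.shape[1] % k_multiple == 0
    )


# Aligned case: N=512 (64*8), K=7168 (128*56) -> eligible for DeepGEMM.
assert deepgemm_gate(torch.bfloat16, torch.empty(512, 7168))
# Misaligned K or a non-bf16 output dtype falls back to the other kernels,
# which is exactly when the updated test now skips.
assert not deepgemm_gate(torch.bfloat16, torch.empty(512, 100))
assert not deepgemm_gate(torch.float16, torch.empty(512, 7168))
```

Routing the test's skip decision through the same helper keeps the test and the production gate from drifting apart, which is what the old hand-written `M % 4 / K % 128 / N % 64` check had already done.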