From 40b86aa05e4458bf28f038666942d620d89c8c3d Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Sun, 6 Jul 2025 00:17:30 -0400
Subject: [PATCH] [BugFix] Fix: ImportError when building on hopper systems
 (#20513)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
---
 .github/CODEOWNERS                                       | 2 +-
 csrc/ops.h                                               | 5 -----
 .../cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu  | 9 ++++++++-
 csrc/torch_bindings.cpp                                  | 3 +--
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index da7f89747a16d..2acb03d52a67c 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -16,7 +16,7 @@
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm
 /vllm/entrypoints @aarnphm
-CMakeLists.txt @tlrmchlsmth
+CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 
 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
diff --git a/csrc/ops.h b/csrc/ops.h
index 56e51cc659d86..52c264d64ccad 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -239,11 +239,6 @@ void cutlass_moe_mm(
     torch::Tensor const& b_strides, torch::Tensor const& c_strides,
     bool per_act_token, bool per_out_ch);
 
-void cutlass_blockwise_scaled_grouped_mm(
-    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
-    const torch::Tensor& scales_a, const torch::Tensor& scales_b,
-    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets);
-
 void cutlass_fp4_group_mm(
     torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
     const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
diff --git a/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu b/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu
index ef57e503b21ae..236d76ed52081 100644
--- a/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu
@@ -1,3 +1,5 @@
+#include "core/registration.h"
+
 #include <torch/all.h>
 #include <cutlass/arch/arch.h>
 
@@ -364,4 +366,9 @@ void cutlass_blockwise_scaled_grouped_mm(
     TORCH_CHECK(false, "Unsupported output tensor type");
   }
 #endif
-}
\ No newline at end of file
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("cutlass_blockwise_scaled_grouped_mm",
+         &cutlass_blockwise_scaled_grouped_mm);
+}
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 04329e75db8c3..9414e26196b28 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -399,8 +399,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "Tensor scales_a, Tensor scales_b, "
       "Tensor problem_sizes, Tensor expert_offsets) -> ()",
       {stride_tag});
-  ops.impl("cutlass_blockwise_scaled_grouped_mm", torch::kCUDA,
-           &cutlass_blockwise_scaled_grouped_mm);
+  // conditionally compiled, so the impl is registered in its .cu source file
 
   // cutlass nvfp4 block scaled group GEMM
   ops.def(