diff --git a/CMakeLists.txt b/CMakeLists.txt
index 98ed682fee7d9..529ce29029b27 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -635,7 +635,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                       "in CUDA target architectures.")
     endif()
   endif()
-
+
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
@@ -842,8 +842,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       "csrc/moe/moe_permute_unpermute_op.cu")
 
     set_gencode_flags_for_srcs(
-      SRCS "${MARLIN_PERMUTE_SRC}"
-      CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
+      SRCS "${MOE_PERMUTE_SRC}"
+      CUDA_ARCHS "${CUDA_ARCHS}")
 
     list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
   endif()
diff --git a/tests/kernels/test_shuffle_rows.py b/tests/kernels/test_shuffle_rows.py
new file mode 100644
index 0000000000000..7d02e1764e7d4
--- /dev/null
+++ b/tests/kernels/test_shuffle_rows.py
@@ -0,0 +1,294 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for the shuffle_rows function
+
+Run `pytest tests/kernels/test_shuffle_rows.py`.
+"""
+
+import pytest
+import torch
+
+from vllm._custom_ops import shuffle_rows
+from vllm.platforms import current_platform
+
+
+@pytest.mark.parametrize("num_tokens", [1, 16, 64, 128, 256, 512, 1024])
+@pytest.mark.parametrize("hidden_size", [128, 256, 512, 1024, 2048, 4096])
+@pytest.mark.parametrize("dtype",
+                         [torch.float16, torch.bfloat16, torch.float32])
+def test_shuffle_rows_basic(num_tokens: int, hidden_size: int,
+                            dtype: torch.dtype):
+    """Test basic functionality of shuffle_rows with various tensor sizes and
+    dtypes."""
+    if not current_platform.is_cuda():
+        pytest.skip("shuffle_rows requires CUDA")
+
+    # Create input tensor
+    input_tensor = torch.randn(num_tokens,
+                               hidden_size,
+                               device="cuda",
+                               dtype=dtype)
+
+    # Create a simple permutation map (identity mapping)
+    dst2src_map = torch.arange(num_tokens, device="cuda", dtype=torch.int32)
+
+    # Test shuffle_rows
+    output = shuffle_rows(input_tensor, dst2src_map)
+
+    # With identity mapping, output should be identical to input
+    torch.testing.assert_close(output, input_tensor, atol=0, rtol=0)
+
+    # Check output shape
+    assert output.shape == (num_tokens, hidden_size)
+    assert output.dtype == dtype
+    assert output.device == input_tensor.device
+
+
+@pytest.mark.parametrize("num_tokens", [16, 64, 128])
+@pytest.mark.parametrize("hidden_size", [128, 512, 1024])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+def test_shuffle_rows_permutation(num_tokens: int, hidden_size: int,
+                                  dtype: torch.dtype):
+    """Test shuffle_rows with actual permutation."""
+    if not current_platform.is_cuda():
+        pytest.skip("shuffle_rows requires CUDA")
+
+    # Create input tensor
+    input_tensor = torch.randn(num_tokens,
+                               hidden_size,
+                               device="cuda",
+                               dtype=dtype)
+
+    # Create a reverse permutation map
+    dst2src_map = torch.arange(num_tokens - 1,
+                               -1,
+                               -1,
+                               device="cuda",
+                               dtype=torch.int32)
+
+    # Test shuffle_rows
+    output = shuffle_rows(input_tensor, dst2src_map)
+
+    # Check that the output is the reverse of the input
+    expected_output = torch.flip(input_tensor, dims=[0])
+    torch.testing.assert_close(output, expected_output, atol=1e-6, rtol=1e-5)
+
+    # Check output shape and properties
+    assert output.shape == (num_tokens, hidden_size)
+    assert output.dtype == dtype
+    assert output.device == input_tensor.device
+
+
+@pytest.mark.parametrize("num_tokens", [32, 64])
+@pytest.mark.parametrize("hidden_size", [256, 512])
+def test_shuffle_rows_expansion(num_tokens: int, hidden_size: int):
+    """Test shuffle_rows with expansion (more output tokens than input
+    tokens)."""
+    if not current_platform.is_cuda():
+        pytest.skip("shuffle_rows requires CUDA")
+
+    dtype = torch.float16
+
+    # Create input tensor
+    input_tensor = torch.randn(num_tokens,
+                               hidden_size,
+                               device="cuda",
+                               dtype=dtype)
+
+    # Create a mapping that duplicates some tokens (expansion)
+    expanded_size = num_tokens * 2
+    dst2src_map = torch.randint(0,
+                                num_tokens, (expanded_size, ),
+                                device="cuda",
+                                dtype=torch.int32)
+
+    # Test shuffle_rows
+    output = shuffle_rows(input_tensor, dst2src_map)
+
+    # Check output shape
+    assert output.shape == (expanded_size, hidden_size)
+    assert output.dtype == dtype
+    assert output.device == input_tensor.device
+
+    # Verify that each output row matches the corresponding input row
+    for i in range(expanded_size):
+        src_idx = dst2src_map[i].item()
+        torch.testing.assert_close(output[i],
+                                   input_tensor[src_idx],
+                                   atol=1e-6,
+                                   rtol=1e-5)
+
+
+@pytest.mark.parametrize("num_tokens", [16, 64])
+@pytest.mark.parametrize("hidden_size", [128, 512])
+def test_shuffle_rows_random_permutation(num_tokens: int, hidden_size: int):
+    """Test shuffle_rows with random permutation."""
+    if not current_platform.is_cuda():
+        pytest.skip("shuffle_rows requires CUDA")
+
+    dtype = torch.float16
+
+    # Set seed for reproducibility
+    torch.manual_seed(42)
+
+    # Create input tensor
+    input_tensor = torch.randn(num_tokens,
+                               hidden_size,
+                               device="cuda",
+                               dtype=dtype)
+
+    # Create a random permutation map
+    dst2src_map = torch.randperm(num_tokens, device="cuda", dtype=torch.int32)
+
+    # Test shuffle_rows
+    output = shuffle_rows(input_tensor, dst2src_map)
+
+    # Check output shape and properties
+    assert output.shape == (num_tokens, hidden_size)
+    assert output.dtype == dtype
+    assert output.device == input_tensor.device
+
+    # Verify that each output row matches the corresponding input row
+    for i in range(num_tokens):
+        src_idx = dst2src_map[i].item()
+        torch.testing.assert_close(output[i],
+                                   input_tensor[src_idx],
+                                   atol=1e-6,
+                                   rtol=1e-5)
+
+
+def test_shuffle_rows_edge_cases():
+    """Test shuffle_rows with edge cases."""
+    if not current_platform.is_cuda():
+        pytest.skip("shuffle_rows requires CUDA")
+
+    dtype = torch.float16
+
+    # Test with single token
+    input_tensor = torch.randn(1, 128, device="cuda", dtype=dtype)
+    dst2src_map = torch.tensor([0], device="cuda", dtype=torch.int32)
+    output = shuffle_rows(input_tensor, dst2src_map)
+    torch.testing.assert_close(output, input_tensor, atol=0, rtol=0)
+
+    # Test with single feature dimension
+    input_tensor = torch.randn(16, 1, device="cuda", dtype=dtype)
+    dst2src_map = torch.arange(16, device="cuda", dtype=torch.int32)
+    output = shuffle_rows(input_tensor, dst2src_map)
+    torch.testing.assert_close(output, input_tensor, atol=0, rtol=0)
+
+
+def test_shuffle_rows_moe_like_scenario():
+    """Test shuffle_rows in a scenario similar to MoE usage."""
+    if not current_platform.is_cuda():
+        pytest.skip("shuffle_rows requires CUDA")
+
+    dtype = torch.float16
+    batch_size = 32
+    hidden_size = 1024
+    topk = 2
+
+    # Simulate input tokens
+    input_tensor = torch.randn(batch_size,
+                               hidden_size,
+                               device="cuda",
+                               dtype=dtype)
+
+    # Simulate expert assignment (each token goes to topk experts)
+    # This creates a mapping where tokens are duplicated for multiple experts
+    total_tokens = batch_size * topk
+    dst2src_map = torch.zeros(total_tokens, device="cuda", dtype=torch.int32)
+
+    # Fill the mapping to simulate MoE token distribution
+    for i in range(batch_size):
+        for k in range(topk):
+            dst2src_map[i * topk + k] = i
+
+    # Test shuffle_rows
+    output = shuffle_rows(input_tensor, dst2src_map)
+
+    # Check output shape
+    assert output.shape == (total_tokens, hidden_size)
+    assert output.dtype == dtype
+    assert output.device == input_tensor.device
+
+    # Verify that tokens are correctly duplicated
+    for i in range(batch_size):
+        for k in range(topk):
+            output_idx = i * topk + k
+            torch.testing.assert_close(output[output_idx],
+                                       input_tensor[i],
+                                       atol=1e-6,
+                                       rtol=1e-5)
+
+
+@pytest.mark.parametrize("dtype",
+                         [torch.float16, torch.bfloat16, torch.float32])
+def test_shuffle_rows_dtype_consistency(dtype: torch.dtype):
+    """Test that shuffle_rows preserves dtype correctly."""
+    if not current_platform.is_cuda():
+        pytest.skip("shuffle_rows requires CUDA")
+
+    num_tokens = 64
+    hidden_size = 512
+
+    # Create input tensor with specific dtype
+    input_tensor = torch.randn(num_tokens,
+                               hidden_size,
+                               device="cuda",
+                               dtype=dtype)
+    dst2src_map = torch.arange(num_tokens, device="cuda", dtype=torch.int32)
+
+    # Test shuffle_rows
+    output = shuffle_rows(input_tensor, dst2src_map)
+
+    # Verify dtype is preserved
+    assert output.dtype == dtype
+    assert output.device == input_tensor.device
+    torch.testing.assert_close(output, input_tensor, atol=1e-6, rtol=1e-5)
+
+
+def test_shuffle_rows_device_consistency():
+    """Test that shuffle_rows maintains device consistency."""
+    if not current_platform.is_cuda():
+        pytest.skip("shuffle_rows requires CUDA")
+
+    num_tokens = 32
+    hidden_size = 256
+    dtype = torch.float16
+
+    # Create input tensor on CUDA
+    input_tensor = torch.randn(num_tokens,
+                               hidden_size,
+                               device="cuda",
+                               dtype=dtype)
+    dst2src_map = torch.arange(num_tokens, device="cuda", dtype=torch.int32)
+
+    # Test shuffle_rows
+    output = shuffle_rows(input_tensor, dst2src_map)
+
+    # Verify device is maintained
+    assert output.device == input_tensor.device
+    assert output.device.type == "cuda"
+
+
+def test_shuffle_rows_contiguous_output():
+    """Test that shuffle_rows produces contiguous output."""
+    if not current_platform.is_cuda():
+        pytest.skip("shuffle_rows requires CUDA")
+
+    num_tokens = 64
+    hidden_size = 512
+    dtype = torch.float16
+
+    # Create input tensor
+    input_tensor = torch.randn(num_tokens,
+                               hidden_size,
+                               device="cuda",
+                               dtype=dtype)
+    dst2src_map = torch.arange(num_tokens, device="cuda", dtype=torch.int32)
+
+    # Test shuffle_rows
+    output = shuffle_rows(input_tensor, dst2src_map)
+
+    # Verify output is contiguous
+    assert output.is_contiguous()