diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 2b3d8f94dbc13..23a46b6ed8ad8 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -246,52 +246,6 @@ __global__ void reshape_and_cache_flash_kernel( } } -template -__global__ void reshape_and_cache_flash_full_cuda_kernel( - const int32_t* __restrict__ tensorshape, - const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] - const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] - cache_t* __restrict__ key_cache, // [num_blocks, block_size, num_heads, - // head_size] - cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads, - // head_size] - const int64_t* __restrict__ slot_mapping, // [num_tokens] - const int block_stride, const int key_stride, const int value_stride, - const int num_heads, const int head_size, const int block_size, - const float k_scale, const float v_scale) { - const int64_t token_idx = blockIdx.x; - - int32_t unpadded_num_tokens = tensorshape[0]; - if (token_idx >= unpadded_num_tokens) { - return; - } - - const int64_t slot_idx = slot_mapping[token_idx]; - const int64_t block_idx = slot_idx / block_size; - const int64_t block_offset = slot_idx % block_size; - const int n = num_heads * head_size; - for (int i = threadIdx.x; i < n; i += blockDim.x) { - const int64_t src_key_idx = token_idx * key_stride + i; - const int64_t src_value_idx = token_idx * value_stride + i; - const int head_idx = i / head_size; - const int head_offset = i % head_size; - const int64_t tgt_key_value_idx = block_idx * block_stride + - block_offset * num_heads * head_size + - head_idx * head_size + head_offset; - scalar_t tgt_key = key[src_key_idx]; - scalar_t tgt_value = value[src_value_idx]; - if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { - key_cache[tgt_key_value_idx] = tgt_key; - value_cache[tgt_key_value_idx] = tgt_value; - } else { - key_cache[tgt_key_value_idx] = - fp8::scaled_convert(tgt_key, k_scale); - 
value_cache[tgt_key_value_idx] = - fp8::scaled_convert(tgt_value, v_scale); - } - } -} - template __global__ void concat_and_cache_mla_kernel( const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank] @@ -434,6 +388,9 @@ void reshape_and_cache_flash( CALL_RESHAPE_AND_CACHE_FLASH); } +// KV_T is the data type of key and value tensors. +// CACHE_T is the stored data type of kv-cache. +// KV_DTYPE is the real data type of kv-cache. #define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \ vllm::concat_and_cache_mla_kernel \ <<>>( \ diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index a2ad5c60e70b5..979890170c16b 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -633,7 +633,6 @@ class VllmBackend: if not self.compilation_config.use_cudagraph or \ not self.compilation_config.cudagraph_copy_inputs: -# return self.graph return self.split_gm # if we need to copy input buffers for cudagraph diff --git a/vllm/config.py b/vllm/config.py index 817f46e2a5fb0..c0664992b4573 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3245,8 +3245,7 @@ class VllmConfig: # and avoid any potential issues with the inductor. self.compilation_config.custom_ops = ["none"] self.compilation_config.use_cudagraph = True -# self.compilation_config.use_inductor = True - self.compilation_config.use_inductor = False + self.compilation_config.use_inductor = True self.compilation_config.cudagraph_num_of_warmups = 1 self.compilation_config.pass_config.enable_fusion = False self.compilation_config.pass_config.enable_reshape = False diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index df153fb51c05f..8bc0da8e3a953 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -460,9 +460,14 @@ class GPUModelRunner: self.slot_mapping_cpu[:total_num_scheduled_tokens], non_blocking=True) + # TODO: Which of these are needed? 
+ self.seq_lens[num_reqs:].fill_(0) self.query_start_loc[num_reqs + 1:].fill_(-1) self.positions[total_num_scheduled_tokens:].fill_(0) - self.slot_mapping[total_num_scheduled_tokens:].fill_(-1) + self.input_batch.block_table.get_device_tensor()[num_reqs:].fill_(-1) + + # Fill with -1s -- needed for reshape_and_cache + self.slot_mapping[total_num_scheduled_tokens:].fill_(-1) # Definitely needed # Prepare for cascade attention if needed. common_prefix_len = (scheduler_output.num_common_prefix_blocks *