diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 2b3d8f94dbc13..23a46b6ed8ad8 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -246,52 +246,6 @@ __global__ void reshape_and_cache_flash_kernel( } } -template -__global__ void reshape_and_cache_flash_full_cuda_kernel( - const int32_t* __restrict__ tensorshape, - const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] - const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] - cache_t* __restrict__ key_cache, // [num_blocks, block_size, num_heads, - // head_size] - cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads, - // head_size] - const int64_t* __restrict__ slot_mapping, // [num_tokens] - const int block_stride, const int key_stride, const int value_stride, - const int num_heads, const int head_size, const int block_size, - const float k_scale, const float v_scale) { - const int64_t token_idx = blockIdx.x; - - int32_t unpadded_num_tokens = tensorshape[0]; - if (token_idx >= unpadded_num_tokens) { - return; - } - - const int64_t slot_idx = slot_mapping[token_idx]; - const int64_t block_idx = slot_idx / block_size; - const int64_t block_offset = slot_idx % block_size; - const int n = num_heads * head_size; - for (int i = threadIdx.x; i < n; i += blockDim.x) { - const int64_t src_key_idx = token_idx * key_stride + i; - const int64_t src_value_idx = token_idx * value_stride + i; - const int head_idx = i / head_size; - const int head_offset = i % head_size; - const int64_t tgt_key_value_idx = block_idx * block_stride + - block_offset * num_heads * head_size + - head_idx * head_size + head_offset; - scalar_t tgt_key = key[src_key_idx]; - scalar_t tgt_value = value[src_value_idx]; - if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { - key_cache[tgt_key_value_idx] = tgt_key; - value_cache[tgt_key_value_idx] = tgt_value; - } else { - key_cache[tgt_key_value_idx] = - fp8::scaled_convert(tgt_key, k_scale); - 
value_cache[tgt_key_value_idx] = - fp8::scaled_convert(tgt_value, v_scale); - } - } -} - template __global__ void concat_and_cache_mla_kernel( const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank] @@ -434,6 +388,9 @@ void reshape_and_cache_flash( CALL_RESHAPE_AND_CACHE_FLASH); } +// KV_T is the data type of key and value tensors. +// CACHE_T is the stored data type of kv-cache. +// KV_DTYPE is the real data type of kv-cache. #define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \ vllm::concat_and_cache_mla_kernel \ <<>>( \ diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index a2ad5c60e70b5..979890170c16b 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -633,7 +633,6 @@ class VllmBackend: if not self.compilation_config.use_cudagraph or \ not self.compilation_config.cudagraph_copy_inputs: -# return self.graph return self.split_gm # if we need to copy input buffers for cudagraph diff --git a/vllm/config.py b/vllm/config.py index 817f46e2a5fb0..c0664992b4573 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3245,8 +3245,7 @@ class VllmConfig: # and avoid any potential issues with the inductor. self.compilation_config.custom_ops = ["none"] self.compilation_config.use_cudagraph = True -# self.compilation_config.use_inductor = True - self.compilation_config.use_inductor = False + self.compilation_config.use_inductor = True self.compilation_config.cudagraph_num_of_warmups = 1 self.compilation_config.pass_config.enable_fusion = False self.compilation_config.pass_config.enable_reshape = False diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index df153fb51c05f..8bc0da8e3a953 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -460,9 +460,14 @@ class GPUModelRunner: self.slot_mapping_cpu[:total_num_scheduled_tokens], non_blocking=True) + # TODO: Which of these are needed? 
+ self.seq_lens[num_reqs:].fill_(0) self.query_start_loc[num_reqs + 1:].fill_(-1) self.positions[total_num_scheduled_tokens:].fill_(0) - self.slot_mapping[total_num_scheduled_tokens:].fill_(-1) + self.input_batch.block_table.get_device_tensor()[num_reqs:].fill_(-1) + + # Fill with -1s -- needed for reshape_and_cache + self.slot_mapping[total_num_scheduled_tokens:].fill_(-1) # Definitely needed # Prepare for cascade attention if needed. common_prefix_len = (scheduler_output.num_common_prefix_blocks *