mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-21 20:07:11 +08:00
Remove unneeded changes
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
parent
c33aeecf24
commit
f33ec6d2d2
@ -246,52 +246,6 @@ __global__ void reshape_and_cache_flash_kernel(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copies one token's key/value head vectors into the paged KV cache.
//
// Launch layout (implied by the indexing below): one thread block per
// (possibly padded) token -- token_idx = blockIdx.x -- whose threads stride
// over the num_heads * head_size elements of that token.
//
// tensorshape[0] holds the *unpadded* token count and is read on the device,
// so the kernel may be launched with a padded grid (e.g. for CUDA graph
// capture) and still skip the padding tokens.
//
// slot_mapping[token_idx] is the flat destination slot in the paged cache.
// Padding entries are filled with -1 by the caller and must not be written.
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void reshape_and_cache_flash_full_cuda_kernel(
    const int32_t* __restrict__ tensorshape,  // [0] = unpadded num_tokens
    const scalar_t* __restrict__ key,    // [num_tokens, num_heads, head_size]
    const scalar_t* __restrict__ value,  // [num_tokens, num_heads, head_size]
    cache_t* __restrict__ key_cache,     // [num_blocks, block_size, num_heads,
                                         // head_size]
    cache_t* __restrict__ value_cache,   // [num_blocks, block_size, num_heads,
                                         // head_size]
    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
    const int block_stride, const int key_stride, const int value_stride,
    const int num_heads, const int head_size, const int block_size,
    const float k_scale, const float v_scale) {
  const int64_t token_idx = blockIdx.x;

  // Skip grid padding beyond the true token count.
  const int32_t unpadded_num_tokens = tensorshape[0];
  if (token_idx >= unpadded_num_tokens) {
    return;
  }

  const int64_t slot_idx = slot_mapping[token_idx];
  // FIX: guard against negative slots. The caller pads slot_mapping with -1
  // (see the model-runner's `slot_mapping[...].fill_(-1)`), and the sibling
  // reshape_and_cache_flash_kernel skips such tokens; without this check a
  // -1 slot produces a negative block_idx and an out-of-bounds cache write.
  if (slot_idx < 0) {
    return;
  }

  const int64_t block_idx = slot_idx / block_size;
  const int64_t block_offset = slot_idx % block_size;

  // Each thread handles every blockDim.x-th element of this token's
  // flattened [num_heads, head_size] key/value vectors.
  const int n = num_heads * head_size;
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    const int64_t src_key_idx = token_idx * key_stride + i;
    const int64_t src_value_idx = token_idx * value_stride + i;
    const int head_idx = i / head_size;
    const int head_offset = i % head_size;
    // Key and value caches share the same layout, so a single target index
    // serves both writes.
    const int64_t tgt_key_value_idx = block_idx * block_stride +
                                      block_offset * num_heads * head_size +
                                      head_idx * head_size + head_offset;
    scalar_t tgt_key = key[src_key_idx];
    scalar_t tgt_value = value[src_value_idx];
    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
      // Cache stores the model dtype directly; plain copy.
      key_cache[tgt_key_value_idx] = tgt_key;
      value_cache[tgt_key_value_idx] = tgt_value;
    } else {
      // FP8 cache: quantize on the way in with per-tensor scales.
      key_cache[tgt_key_value_idx] =
          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, k_scale);
      value_cache[tgt_key_value_idx] =
          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, v_scale);
    }
  }
}
|
|
||||||
|
|
||||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||||
__global__ void concat_and_cache_mla_kernel(
|
__global__ void concat_and_cache_mla_kernel(
|
||||||
const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank]
|
const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank]
|
||||||
@ -434,6 +388,9 @@ void reshape_and_cache_flash(
|
|||||||
CALL_RESHAPE_AND_CACHE_FLASH);
|
CALL_RESHAPE_AND_CACHE_FLASH);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// KV_T is the stored data type of kv-cache.
|
||||||
|
// CACHE_T is the data type of key and value tensors.
|
||||||
|
// KV_DTYPE is the real data type of kv-cache.
|
||||||
#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \
|
#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \
|
||||||
vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
||||||
<<<grid, block, 0, stream>>>( \
|
<<<grid, block, 0, stream>>>( \
|
||||||
|
|||||||
@ -633,7 +633,6 @@ class VllmBackend:
|
|||||||
|
|
||||||
if not self.compilation_config.use_cudagraph or \
|
if not self.compilation_config.use_cudagraph or \
|
||||||
not self.compilation_config.cudagraph_copy_inputs:
|
not self.compilation_config.cudagraph_copy_inputs:
|
||||||
# return self.graph
|
|
||||||
return self.split_gm
|
return self.split_gm
|
||||||
|
|
||||||
# if we need to copy input buffers for cudagraph
|
# if we need to copy input buffers for cudagraph
|
||||||
|
|||||||
@ -3245,8 +3245,7 @@ class VllmConfig:
|
|||||||
# and avoid any potential issues with the inductor.
|
# and avoid any potential issues with the inductor.
|
||||||
self.compilation_config.custom_ops = ["none"]
|
self.compilation_config.custom_ops = ["none"]
|
||||||
self.compilation_config.use_cudagraph = True
|
self.compilation_config.use_cudagraph = True
|
||||||
# self.compilation_config.use_inductor = True
|
self.compilation_config.use_inductor = True
|
||||||
self.compilation_config.use_inductor = False
|
|
||||||
self.compilation_config.cudagraph_num_of_warmups = 1
|
self.compilation_config.cudagraph_num_of_warmups = 1
|
||||||
self.compilation_config.pass_config.enable_fusion = False
|
self.compilation_config.pass_config.enable_fusion = False
|
||||||
self.compilation_config.pass_config.enable_reshape = False
|
self.compilation_config.pass_config.enable_reshape = False
|
||||||
|
|||||||
@ -460,9 +460,14 @@ class GPUModelRunner:
|
|||||||
self.slot_mapping_cpu[:total_num_scheduled_tokens],
|
self.slot_mapping_cpu[:total_num_scheduled_tokens],
|
||||||
non_blocking=True)
|
non_blocking=True)
|
||||||
|
|
||||||
|
# TODO: Which of these are needed?
|
||||||
|
self.seq_lens[num_reqs:].fill_(0)
|
||||||
self.query_start_loc[num_reqs + 1:].fill_(-1)
|
self.query_start_loc[num_reqs + 1:].fill_(-1)
|
||||||
self.positions[total_num_scheduled_tokens:].fill_(0)
|
self.positions[total_num_scheduled_tokens:].fill_(0)
|
||||||
self.slot_mapping[total_num_scheduled_tokens:].fill_(-1)
|
self.input_batch.block_table.get_device_tensor()[num_reqs:].fill_(-1)
|
||||||
|
|
||||||
|
# Fill with -1s -- needed for reshape_and_cache
|
||||||
|
self.slot_mapping[total_num_scheduled_tokens:].fill_(-1) # Definitely needed
|
||||||
|
|
||||||
# Prepare for cascade attention if needed.
|
# Prepare for cascade attention if needed.
|
||||||
common_prefix_len = (scheduler_output.num_common_prefix_blocks *
|
common_prefix_len = (scheduler_output.num_common_prefix_blocks *
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user