Mirror of https://git.datalinker.icu/vllm-project/vllm.git, last synced 2026-05-16 20:02:22 +08:00.
[CPU][Bugfix] Fix _to_list in CPU model runner (#28824)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
This commit is contained in:
parent
3380ed5e11
commit
577bb34fff
@ -100,6 +100,9 @@ void cpu_attention_with_kv_cache(
|
|||||||
const torch::Tensor& scheduler_metadata,
|
const torch::Tensor& scheduler_metadata,
|
||||||
const std::optional<torch::Tensor>& s_aux);
|
const std::optional<torch::Tensor>& s_aux);
|
||||||
|
|
||||||
|
// Note: just for avoiding importing errors
|
||||||
|
void placeholder_op() { TORCH_CHECK(false, "Unimplemented"); }
|
||||||
|
|
||||||
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||||
// vLLM custom ops
|
// vLLM custom ops
|
||||||
|
|
||||||
@ -275,6 +278,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
"sliding_window_left, SymInt sliding_window_right, Tensor block_table, "
|
"sliding_window_left, SymInt sliding_window_right, Tensor block_table, "
|
||||||
"float softcap, Tensor sheduler_metadata, Tensor? s_aux) -> ()",
|
"float softcap, Tensor sheduler_metadata, Tensor? s_aux) -> ()",
|
||||||
&cpu_attention_with_kv_cache);
|
&cpu_attention_with_kv_cache);
|
||||||
|
|
||||||
|
// placeholders
|
||||||
|
ops.def("static_scaled_fp8_quant() -> ()", placeholder_op);
|
||||||
|
ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op);
|
||||||
|
ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op);
|
||||||
}
|
}
|
||||||
|
|
||||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
|
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
|
||||||
|
|||||||
@ -80,9 +80,6 @@ class CPUModelRunner(GPUModelRunner):
|
|||||||
def _sync_device(self) -> None:
|
def _sync_device(self) -> None:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]:
|
|
||||||
return sampled_token_ids.tolist()
|
|
||||||
|
|
||||||
def get_dp_padding(self, num_tokens: int) -> tuple[int, torch.Tensor | None]:
|
def get_dp_padding(self, num_tokens: int) -> tuple[int, torch.Tensor | None]:
|
||||||
# Note: For CPU backend, dp padding is not required for now.
|
# Note: For CPU backend, dp padding is not required for now.
|
||||||
return 0, None
|
return 0, None
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user