diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index 5e2aa70692566..9fefd88cd9b08 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -100,6 +100,9 @@ void cpu_attention_with_kv_cache(
     const torch::Tensor& scheduler_metadata,
     const std::optional<torch::Tensor>& s_aux);
 
+// Note: just to avoid import errors
+void placeholder_op() { TORCH_CHECK(false, "Unimplemented"); }
+
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // vLLM custom ops
@@ -275,6 +278,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "sliding_window_left, SymInt sliding_window_right, Tensor block_table, "
       "float softcap, Tensor sheduler_metadata, Tensor? s_aux) -> ()",
       &cpu_attention_with_kv_cache);
+
+  // placeholders
+  ops.def("static_scaled_fp8_quant() -> ()", placeholder_op);
+  ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op);
+  ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op);
 }
 
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py
index ceb1cf64b5889..40f011fed1ada 100644
--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -80,9 +80,6 @@ class CPUModelRunner(GPUModelRunner):
     def _sync_device(self) -> None:
         pass
 
-    def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]:
-        return sampled_token_ids.tolist()
-
     def get_dp_padding(self, num_tokens: int) -> tuple[int, torch.Tensor | None]:
         # Note: For CPU backend, dp padding is not required for now.
         return 0, None
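
Per the diff's own note, the placeholder registrations exist only so that importing vLLM's Python-side op wrappers does not fail on CPU-only builds: the wrappers resolve custom ops by attribute lookup on `torch.ops._C`, and that lookup raises `AttributeError` for any op whose schema was never registered. Below is a minimal sketch (not part of this diff) of the resulting behavior, assuming the extension library has already been loaded and `TORCH_EXTENSION_NAME` resolves to `_C` as in vLLM's default build:

```python
import torch

# Assumption: the CPU extension library has already been loaded (vLLM does
# this when `vllm._C` is imported), so the placeholder schemas are registered.

# Attribute lookup now succeeds instead of raising AttributeError at import
# time -- this is all the placeholders are meant to guarantee.
op = torch.ops._C.static_scaled_fp8_quant

# Actually invoking the op still fails loudly: the registered schema is
# "() -> ()" and the implementation is placeholder_op(), whose
# TORCH_CHECK(false, ...) surfaces in Python as a RuntimeError.
try:
    op()
except RuntimeError as err:
    print(err)  # message contains "Unimplemented"
```

Because the placeholder schemas take no arguments, a call made with the real fp8-quant signatures fails at schema matching as well, so the stubs cannot be silently mistaken for working kernels.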