[Bugfix] Fix Stream usage in CPU model runner and OneDNN kernel check (#25046)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
Author: Li, Jiang
Date:   2025-09-17 20:54:02 +08:00 (committed by GitHub)
Parent: 252ada5559
Commit: 9fccd04e30
3 changed files with 14 additions and 1 deletion

@@ -523,7 +523,7 @@ void onednn_mm(torch::Tensor& c,  // [M, OC], row-major
   CPU_KERNEL_GUARD_IN(onednn_mm)
   TORCH_CHECK(a.dim() == 2);
   TORCH_CHECK(a.stride(-1) == 1);
-  TORCH_CHECK(c.is_contiguous());
+  TORCH_CHECK(c.stride(-1) == 1);
   MatMulPrimitiveHandler* ptr =
       reinterpret_cast<MatMulPrimitiveHandler*>(handler);
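
The relaxed check matters because a row-major output can have a densely
packed inner dimension without being fully contiguous, e.g. when it is a
column slice of a wider buffer; oneDNN memory descriptors support such
layouts through strides, so only the inner stride needs to be 1. A minimal
sketch of the distinction (plain PyTorch, illustrative only):

    import torch

    # A [4, 6] view into a [4, 8] buffer: each row is densely packed
    # (inner stride 1), but rows are separated by 2 elements of padding.
    buf = torch.empty(4, 8)
    c = buf[:, :6]

    assert c.stride(-1) == 1       # passes the new check
    assert not c.is_contiguous()   # would have failed the old check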

@@ -185,6 +185,11 @@ class CpuPlatform(Platform):
             parallel_config.distributed_executor_backend = "mp"
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.v1.worker.cpu_worker.CPUWorker"
+        # Disable DBO
+        if parallel_config.enable_dbo:
+            logger.warning(
+                "Dual-Batch Overlap is not supported on CPU, disabled.")
+            parallel_config.enable_dbo = False
 
         # Note: workaround for v1 gpu_model_runner
         from vllm.config import CompilationLevel
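
The new guard follows the same normalize-and-warn pattern as the
surrounding overrides: it rewrites the config during the platform check so
downstream workers can assume enable_dbo is False on CPU. A standalone
sketch of the pattern, with a simplified stand-in for vLLM's config object
(names illustrative only):

    import logging

    logger = logging.getLogger(__name__)

    class ParallelConfig:  # simplified stand-in, not vLLM's real class
        enable_dbo: bool = True

    parallel_config = ParallelConfig()
    if parallel_config.enable_dbo:
        logger.warning(
            "Dual-Batch Overlap is not supported on CPU, disabled.")
        parallel_config.enable_dbo = False

    assert parallel_config.enable_dbo is False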

@@ -145,12 +145,20 @@ def _torch_cuda_wrapper():
             self.record = lambda: None
             self.synchronize = lambda: None
 
+    class _StreamPlaceholder:
+
+        def __init__(self, *args, **kwargs) -> None:
+            pass
+
     cuda_event = torch.cuda.Event
+    cuda_stream = torch.cuda.Stream
     try:
         torch.cuda.Event = _EventPlaceholder
+        torch.cuda.Stream = _StreamPlaceholder
         yield
     finally:
         torch.cuda.Event = cuda_event
+        torch.cuda.Stream = cuda_stream
 
 
 @contextmanager
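
With this change the wrapper shadows both torch.cuda.Event and
torch.cuda.Stream while CPU execution reuses GPU model-runner code paths,
then restores the real classes on exit. A minimal sketch of why the
placeholder is needed (illustrative helper names, not the exact vLLM
code):

    from contextlib import contextmanager

    import torch

    class _StreamPlaceholder:
        # Accepts any constructor arguments and does nothing, so code
        # that instantiates streams keeps working without a GPU.
        def __init__(self, *args, **kwargs) -> None:
            pass

    @contextmanager
    def _fake_cuda_stream():
        saved = torch.cuda.Stream
        try:
            torch.cuda.Stream = _StreamPlaceholder
            yield
        finally:
            torch.cuda.Stream = saved  # leave torch untouched afterwards

    with _fake_cuda_stream():
        # On a CUDA-less build, constructing a real torch.cuda.Stream
        # would raise; the placeholder makes this a no-op.
        stream = torch.cuda.Stream()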