mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-16 16:36:02 +08:00
[Bugfix] Fix Stream usage in CPU model runner and OneDNN kernel check (#25046)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
This commit is contained in:
parent
252ada5559
commit
9fccd04e30
@ -523,7 +523,7 @@ void onednn_mm(torch::Tensor& c, // [M, OC], row-major
|
|||||||
CPU_KERNEL_GUARD_IN(onednn_mm)
|
CPU_KERNEL_GUARD_IN(onednn_mm)
|
||||||
TORCH_CHECK(a.dim() == 2);
|
TORCH_CHECK(a.dim() == 2);
|
||||||
TORCH_CHECK(a.stride(-1) == 1);
|
TORCH_CHECK(a.stride(-1) == 1);
|
||||||
TORCH_CHECK(c.is_contiguous());
|
TORCH_CHECK(c.stride(-1) == 1);
|
||||||
MatMulPrimitiveHandler* ptr =
|
MatMulPrimitiveHandler* ptr =
|
||||||
reinterpret_cast<MatMulPrimitiveHandler*>(handler);
|
reinterpret_cast<MatMulPrimitiveHandler*>(handler);
|
||||||
|
|
||||||
|
|||||||
@ -185,6 +185,11 @@ class CpuPlatform(Platform):
|
|||||||
parallel_config.distributed_executor_backend = "mp"
|
parallel_config.distributed_executor_backend = "mp"
|
||||||
if parallel_config.worker_cls == "auto":
|
if parallel_config.worker_cls == "auto":
|
||||||
parallel_config.worker_cls = "vllm.v1.worker.cpu_worker.CPUWorker"
|
parallel_config.worker_cls = "vllm.v1.worker.cpu_worker.CPUWorker"
|
||||||
|
# Disable DBO
|
||||||
|
if parallel_config.enable_dbo:
|
||||||
|
logger.warning(
|
||||||
|
"Dual-Batch Overlap is not supported on CPU, disabled.")
|
||||||
|
parallel_config.enable_dbo = False
|
||||||
|
|
||||||
# Note: workaround for v1 gpu_model_runner
|
# Note: workaround for v1 gpu_model_runner
|
||||||
from vllm.config import CompilationLevel
|
from vllm.config import CompilationLevel
|
||||||
|
|||||||
@ -145,12 +145,20 @@ def _torch_cuda_wrapper():
|
|||||||
self.record = lambda: None
|
self.record = lambda: None
|
||||||
self.synchronize = lambda: None
|
self.synchronize = lambda: None
|
||||||
|
|
||||||
|
class _StreamPlaceholder:
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs) -> None:
|
||||||
|
pass
|
||||||
|
|
||||||
cuda_event = torch.cuda.Event
|
cuda_event = torch.cuda.Event
|
||||||
|
cuda_stream = torch.cuda.Stream
|
||||||
try:
|
try:
|
||||||
torch.cuda.Event = _EventPlaceholder
|
torch.cuda.Event = _EventPlaceholder
|
||||||
|
torch.cuda.Stream = _StreamPlaceholder
|
||||||
yield
|
yield
|
||||||
finally:
|
finally:
|
||||||
torch.cuda.Event = cuda_event
|
torch.cuda.Event = cuda_event
|
||||||
|
torch.cuda.Stream = cuda_stream
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user