mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-24 03:11:18 +08:00
add missing kernels for cuda dispatch
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
This commit is contained in:
parent
f10171cb3d
commit
fb72ec8218
@ -17,6 +17,10 @@ from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import (
|
|||||||
CutlassFP8ScaledMMLinearKernel,
|
CutlassFP8ScaledMMLinearKernel,
|
||||||
CutlassScaledMMLinearKernel,
|
CutlassScaledMMLinearKernel,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from vllm.model_executor.layers.quantization.kernels.scaled_mm.flashinfer import (
|
||||||
|
FlashInferScaledMMLinearKernel
|
||||||
|
)
|
||||||
from vllm.model_executor.layers.quantization.kernels.scaled_mm.pytorch import (
|
from vllm.model_executor.layers.quantization.kernels.scaled_mm.pytorch import (
|
||||||
ChannelWiseTorchScaledMMLinearKernel,
|
ChannelWiseTorchScaledMMLinearKernel,
|
||||||
PerTensorTorchScaledMMLinearKernel,
|
PerTensorTorchScaledMMLinearKernel,
|
||||||
@ -54,7 +58,13 @@ _POSSIBLE_INT8_KERNELS: dict[PlatformEnum, list[type[Int8ScaledMMLinearKernel]]]
|
|||||||
|
|
||||||
# in priority/performance order (when available)
|
# in priority/performance order (when available)
|
||||||
_POSSIBLE_FP8_KERNELS: dict[PlatformEnum, list[type[FP8ScaledMMLinearKernel]]] = {
|
_POSSIBLE_FP8_KERNELS: dict[PlatformEnum, list[type[FP8ScaledMMLinearKernel]]] = {
|
||||||
PlatformEnum.CUDA: [CutlassFP8ScaledMMLinearKernel],
|
PlatformEnum.CUDA: [
|
||||||
|
FlashInferScaledMMLinearKernel,
|
||||||
|
CutlassFP8ScaledMMLinearKernel,
|
||||||
|
PerTensorTorchScaledMMLinearKernel,
|
||||||
|
RowWiseTorchScaledMMLinearKernel,
|
||||||
|
ChannelWiseTorchScaledMMLinearKernel,
|
||||||
|
],
|
||||||
PlatformEnum.ROCM: [
|
PlatformEnum.ROCM: [
|
||||||
ROCmScaledMMLinearKernel,
|
ROCmScaledMMLinearKernel,
|
||||||
PerTensorTorchScaledMMLinearKernel,
|
PerTensorTorchScaledMMLinearKernel,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user