mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-06 05:37:03 +08:00
Add missing kernels for CUDA dispatch
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
This commit is contained in:
parent
f10171cb3d
commit
fb72ec8218
@ -17,6 +17,10 @@ from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import (
|
||||
CutlassFP8ScaledMMLinearKernel,
|
||||
CutlassScaledMMLinearKernel,
|
||||
)
|
||||
|
||||
from vllm.model_executor.layers.quantization.kernels.scaled_mm.flashinfer import (
|
||||
FlashInferScaledMMLinearKernel
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.kernels.scaled_mm.pytorch import (
|
||||
ChannelWiseTorchScaledMMLinearKernel,
|
||||
PerTensorTorchScaledMMLinearKernel,
|
||||
@ -54,7 +58,13 @@ _POSSIBLE_INT8_KERNELS: dict[PlatformEnum, list[type[Int8ScaledMMLinearKernel]]]
|
||||
|
||||
# in priority/performance order (when available)
|
||||
_POSSIBLE_FP8_KERNELS: dict[PlatformEnum, list[type[FP8ScaledMMLinearKernel]]] = {
|
||||
PlatformEnum.CUDA: [CutlassFP8ScaledMMLinearKernel],
|
||||
PlatformEnum.CUDA: [
|
||||
FlashInferScaledMMLinearKernel,
|
||||
CutlassFP8ScaledMMLinearKernel,
|
||||
PerTensorTorchScaledMMLinearKernel,
|
||||
RowWiseTorchScaledMMLinearKernel,
|
||||
ChannelWiseTorchScaledMMLinearKernel,
|
||||
],
|
||||
PlatformEnum.ROCM: [
|
||||
ROCmScaledMMLinearKernel,
|
||||
PerTensorTorchScaledMMLinearKernel,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user