diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 3ab1115f14462..f24c50ad73261 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1433,15 +1433,15 @@ class EngineArgs:
                                recommend_to_remove=True)
             return False
 
-        # Need at least Ampere for now (FA support required).
-        # Skip this check if we are running on a non-GPU platform,
-        # or if the device capability is not available
-        # (e.g. in a Ray actor without GPUs).
+        # Triton v3.3 has f16 conversion regression issue on Turing and Volta,
+        # which broke fp16 inference
+        # see: https://github.com/triton-lang/triton/issues/6698
         if (current_platform.is_cuda()
-                and current_platform.get_device_capability()
-                and current_platform.get_device_capability().major < 8):
-            _raise_or_fallback(feature_name="Compute Capability < 8.0",
-                               recommend_to_remove=False)
+                and not current_platform.has_device_capability(80)
+                and model_config.dtype == torch.float16):
+            _raise_or_fallback(
+                feature_name="Compute Capability < 8.0 with FP16",
+                recommend_to_remove=False)
             return False
 
         if self.kv_cache_dtype != "auto":
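To make the narrowed gate easier to follow in isolation, here is a minimal standalone sketch of the same logic: only CUDA devices below compute capability 8.0 (pre-Ampere, e.g. Turing/Volta) running in fp16 are rejected, while float32 or newer GPUs pass through. This is not vLLM's implementation; plain `torch.cuda` calls stand in for `current_platform.has_device_capability(80)`, a `ValueError` stands in for `_raise_or_fallback`, and the function name `check_fp16_supported` is invented for illustration.

```python
import torch


def check_fp16_supported(dtype: torch.dtype) -> None:
    """Raise if fp16 is requested on a pre-Ampere (compute capability < 8.0) CUDA GPU."""
    if not torch.cuda.is_available():
        # Non-CUDA platforms are not affected by this check.
        return
    major, _minor = torch.cuda.get_device_capability()
    if major < 8 and dtype == torch.float16:
        raise ValueError(
            "FP16 on Compute Capability < 8.0 (e.g. Turing/Volta) is affected by the "
            "Triton 3.3 f16 conversion regression "
            "(https://github.com/triton-lang/triton/issues/6698); use float32 instead."
        )


# Example: passes on Ampere+ or with float32, raises on a Turing GPU with float16.
check_fp16_supported(torch.float32)
```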