mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-13 20:44:27 +08:00
[Kernel] Turn off CUTLASS scaled_mm for Ada Lovelace (#6384)
This commit is contained in:
parent
6ef3bf912c
commit
9dad5cc859
@ -1,11 +1,11 @@
|
|||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 250 -f 5 -t 1
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
|
||||||
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
|
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
|
||||||
tasks:
|
tasks:
|
||||||
- name: "gsm8k"
|
- name: "gsm8k"
|
||||||
metrics:
|
metrics:
|
||||||
- name: "exact_match,strict-match"
|
- name: "exact_match,strict-match"
|
||||||
value: 0.752
|
value: 0.755
|
||||||
- name: "exact_match,flexible-extract"
|
- name: "exact_match,flexible-extract"
|
||||||
value: 0.752
|
value: 0.755
|
||||||
limit: 250
|
limit: 1000
|
||||||
num_fewshot: 5
|
num_fewshot: 5
|
||||||
|
|||||||
@ -4,8 +4,8 @@ tasks:
|
|||||||
- name: "gsm8k"
|
- name: "gsm8k"
|
||||||
metrics:
|
metrics:
|
||||||
- name: "exact_match,strict-match"
|
- name: "exact_match,strict-match"
|
||||||
value: 0.756
|
value: 0.753
|
||||||
- name: "exact_match,flexible-extract"
|
- name: "exact_match,flexible-extract"
|
||||||
value: 0.752
|
value: 0.753
|
||||||
limit: 250
|
limit: 1000
|
||||||
num_fewshot: 5
|
num_fewshot: 5
|
||||||
|
|||||||
@ -38,7 +38,13 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
|
|||||||
if (cuda_device_capability >= 90) {
|
if (cuda_device_capability >= 90) {
|
||||||
return CUDA_VERSION >= 12000;
|
return CUDA_VERSION >= 12000;
|
||||||
} else if (cuda_device_capability >= 89) {
|
} else if (cuda_device_capability >= 89) {
|
||||||
return CUDA_VERSION >= 12040;
|
// CUTLASS kernels have not been tuned for Ada Lovelace systems
|
||||||
|
// and are slower than torch.mm. Return false unconditionally in this case.
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// Once the CUTLASS kernels have been optimized for Lovelace systems,
|
||||||
|
// use the following check:
|
||||||
|
// return CUDA_VERSION >= 12040;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user