mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-16 15:27:13 +08:00
[Bugfix][B200] Fix cutlass_mla hang (#24966)
Signed-off-by: Alexander Matveev <amatveev@redhat.com> Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
parent
bff2e5f1d6
commit
fedb75fa27
@ -133,6 +133,14 @@ public:
|
||||
// printf(" sm_count = %d\n", sm_count);
|
||||
int max_splits = ceil_div(K, 128);
|
||||
max_splits = min(16, max_splits);
|
||||
|
||||
// TODO: This avoids a hang when the batch size larger than 1 and
|
||||
// there is more than 4 kv_splits.
|
||||
// Discuss with NVIDIA how this can be fixed.
|
||||
if (B > 1) {
|
||||
max_splits = min(2, max_splits);
|
||||
}
|
||||
|
||||
// printf(" max_splits = %d\n", max_splits);
|
||||
int sms_per_batch = max(1, sm_count / B);
|
||||
// printf(" sms_per_batch = %d\n", sms_per_batch);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user