mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-13 21:15:28 +08:00
[Kernel] Add punica dimension for Baichuan-13B (#4053)
This commit is contained in:
parent
0a430b4ae2
commit
989ae2538d
@ -47,6 +47,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
|||||||
f(in_T, out_T, W_T, narrow, 13696) \
|
f(in_T, out_T, W_T, narrow, 13696) \
|
||||||
f(in_T, out_T, W_T, narrow, 13824) \
|
f(in_T, out_T, W_T, narrow, 13824) \
|
||||||
f(in_T, out_T, W_T, narrow, 14336) \
|
f(in_T, out_T, W_T, narrow, 14336) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 15360) \
|
||||||
f(in_T, out_T, W_T, narrow, 16384) \
|
f(in_T, out_T, W_T, narrow, 16384) \
|
||||||
f(in_T, out_T, W_T, narrow, 20480) \
|
f(in_T, out_T, W_T, narrow, 20480) \
|
||||||
f(in_T, out_T, W_T, narrow, 22016) \
|
f(in_T, out_T, W_T, narrow, 22016) \
|
||||||
|
|||||||
@ -62,7 +62,7 @@ def test_baichuan_lora(baichuan_lora_files):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip("Requires multiple GPUs")
|
@pytest.mark.skip("Requires multiple GPUs")
|
||||||
def test_llama_tensor_parallel_equality(baichuan_lora_files):
|
def test_baichuan_tensor_parallel_equality(baichuan_lora_files):
|
||||||
# Cannot use as it will initialize torch.cuda too early...
|
# Cannot use as it will initialize torch.cuda too early...
|
||||||
# if torch.cuda.device_count() < 4:
|
# if torch.cuda.device_count() < 4:
|
||||||
# pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
|
# pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
|
||||||
|
|||||||
@ -72,6 +72,7 @@ H1 = H2 = [
|
|||||||
11008,
|
11008,
|
||||||
13824,
|
13824,
|
||||||
14336,
|
14336,
|
||||||
|
15360,
|
||||||
22016,
|
22016,
|
||||||
24576,
|
24576,
|
||||||
27392,
|
27392,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user