mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 10:40:44 +08:00
Signed-off-by: Wes Medford <wryanmedford@gmail.com>
This commit is contained in:
parent
8a49eea74b
commit
4ae17bf1e2
@ -189,11 +189,7 @@ def fused_moe_kernel_gptq_awq(
|
||||
mask=token_mask[:, None] &
|
||||
(offs_k[None, :] < K - k * BLOCK_SIZE_K),
|
||||
other=0.0)
|
||||
b = tl.load(
|
||||
b_ptrs,
|
||||
cache_modifier=".cg",
|
||||
eviction_policy="evict_last",
|
||||
)
|
||||
b = tl.load(b_ptrs)
|
||||
if use_int4_w4a16:
|
||||
b = (b >> b_shifter) & 0xF
|
||||
|
||||
@ -395,13 +391,9 @@ def fused_moe_kernel(
|
||||
mask=token_mask[:, None] &
|
||||
(offs_k[None, :] < K - k * BLOCK_SIZE_K),
|
||||
other=0.0)
|
||||
b = tl.load(
|
||||
b_ptrs,
|
||||
mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
|
||||
other=0.0,
|
||||
cache_modifier=".cg",
|
||||
eviction_policy="evict_last",
|
||||
)
|
||||
b = tl.load(b_ptrs,
|
||||
mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
|
||||
other=0.0)
|
||||
# We accumulate along the K dimension.
|
||||
if use_int8_w8a16:
|
||||
accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user