mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 00:06:06 +08:00
[Kernels] Use empty for modular MoE workspaces (#19667)
Signed-off-by: Bill Nell <bnell@redhat.com>
This commit is contained in:
parent
836d4ce140
commit
5e5baa91aa
@@ -716,6 +716,9 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
intermediate_cache2 = _resize_cache(workspace2,
|
||||
(E, max_num_tokens, N // 2))
|
||||
|
||||
+if self.use_fp8_w8a8:
|
||||
+    intermediate_cache1.fill_(0)
|
||||
|
||||
# MM1
|
||||
invoke_moe_batched_triton_kernel(A=hidden_states,
|
||||
B=w1,
|
||||
|
||||
@@ -426,10 +426,10 @@ class FusedMoEModularKernel(torch.nn.Module):
|
||||
|
||||
# We can reuse the memory between cache1 and cache3 because by the
|
||||
# time we need cache3, we're done with cache1.
|
||||
-workspace13 = torch.zeros(prod(workspace13_shape),
|
||||
+workspace13 = torch.empty(prod(workspace13_shape),
|
||||
device=a1.device,
|
||||
dtype=workspace_dtype)
|
||||
-workspace2 = torch.zeros(prod(workspace2_shape),
|
||||
+workspace2 = torch.empty(prod(workspace2_shape),
|
||||
device=a1.device,
|
||||
dtype=workspace_dtype)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user