mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-09 16:35:43 +08:00
[Perf] Fix DeepGEMM Contiguous Layout Issue, 5.5% Throughput Improvement (#24783)
Signed-off-by: yewentao256 <zhyanwentao@126.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
This commit is contained in:
parent
fec347dee1
commit
fc2dbcda8b
@@ -772,10 +772,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
|
||||
if self.allow_deep_gemm and not is_deep_gemm_e8m0_used():
|
||||
if _is_col_major(layer.w13_weight_scale_inv):
|
||||
layer.w13_weight_scale_inv = \
|
||||
get_col_major_tma_aligned_tensor(layer.w13_weight_scale_inv).contiguous()
|
||||
get_col_major_tma_aligned_tensor(layer.w13_weight_scale_inv)
|
||||
if _is_col_major(layer.w2_weight_scale_inv):
|
||||
layer.w2_weight_scale_inv = \
|
||||
get_col_major_tma_aligned_tensor(layer.w2_weight_scale_inv).contiguous()
|
||||
get_col_major_tma_aligned_tensor(layer.w2_weight_scale_inv)
|
||||
|
||||
# If checkpoint is fp16, quantize in place.
|
||||
elif not self.quant_config.is_checkpoint_fp8_serialized:
|
||||
@@ -923,10 +923,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
|
||||
# Ensure column-major TMA alignment expected by DeepGEMM.
|
||||
if _is_col_major(layer.w13_weight_scale_inv):
|
||||
layer.w13_weight_scale_inv = get_col_major_tma_aligned_tensor(
|
||||
layer.w13_weight_scale_inv).contiguous()
|
||||
layer.w13_weight_scale_inv)
|
||||
if _is_col_major(layer.w2_weight_scale_inv):
|
||||
layer.w2_weight_scale_inv = get_col_major_tma_aligned_tensor(
|
||||
layer.w2_weight_scale_inv).contiguous()
|
||||
layer.w2_weight_scale_inv)
|
||||
|
||||
def select_gemm_impl(
|
||||
self,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user