mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-13 10:29:50 +08:00
[Model] Granite-4 support loading quantized checkpoint (#22925)
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
This commit is contained in:
parent
8ad7285ea2
commit
6cd69f51bf
@ -471,7 +471,10 @@ class GraniteMoeHybridModel(nn.Module):
|
|||||||
# Mapping different experts' layout:
|
# Mapping different experts' layout:
|
||||||
# from HF (input_linear, output_linear, router)
|
# from HF (input_linear, output_linear, router)
|
||||||
# to vLLM (experts_w13({e}.w1, {e}.w2), experts_w3({e}.w3), gate)
|
# to vLLM (experts_w13({e}.w1, {e}.w2), experts_w3({e}.w3), gate)
|
||||||
if n.endswith('.block_sparse_moe.input_linear.weight'):
|
# The renaming and parameter loading logic is the same for weight
|
||||||
|
# and weight_scale tensors so we can reuse them without issues.
|
||||||
|
if (n.endswith('.block_sparse_moe.input_linear.weight') or
|
||||||
|
n.endswith('.block_sparse_moe.input_linear.weight_scale')):
|
||||||
for e in range(p.size(0)):
|
for e in range(p.size(0)):
|
||||||
w1_name = n.replace(
|
w1_name = n.replace(
|
||||||
'.block_sparse_moe.input_linear.weight',
|
'.block_sparse_moe.input_linear.weight',
|
||||||
@ -490,7 +493,8 @@ class GraniteMoeHybridModel(nn.Module):
|
|||||||
w3_name,
|
w3_name,
|
||||||
shard_id='w3',
|
shard_id='w3',
|
||||||
expert_id=e)
|
expert_id=e)
|
||||||
elif n.endswith('.block_sparse_moe.output_linear.weight'):
|
elif (n.endswith('.block_sparse_moe.output_linear.weight') or
|
||||||
|
n.endswith('.block_sparse_moe.output_linear.weight_scale')):
|
||||||
for e in range(p.size(0)):
|
for e in range(p.size(0)):
|
||||||
w2_name = n.replace(
|
w2_name = n.replace(
|
||||||
'.block_sparse_moe.output_linear.weight',
|
'.block_sparse_moe.output_linear.weight',
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user