[BugFix] Fix GGUF tp>1 when vocab_size is not divisible by 64 (#12230)
Signed-off-by: NickLucche <nlucches@redhat.com>
commit 5fe6bf29d6
parent d4b62d4641
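
Background (summary, not part of the commit message): VocabParallelEmbedding pads the vocabulary up to a multiple of 64 before splitting it across tensor-parallel ranks, so each rank's shard is sized from the padded vocabulary rather than from vocab_size // tp_size. When GGUF weights are loaded, the parameter is still uninitialized and was being materialized with the naive // tp_size size, which is too small whenever vocab_size is not a multiple of 64. A minimal sketch of that arithmetic, with a hypothetical vocab_size of 32001 and tp_size of 2 (the 64-row padding multiple is the one named in the title; the specific numbers are illustrative):

# Sketch of the shard-size arithmetic behind the bug (illustrative,
# not vLLM's actual helpers). vocab_size=32001 and tp_size=2 are hypothetical.

def pad_vocab_size(vocab_size: int, pad_to: int = 64) -> int:
    # Round the vocabulary up to the next multiple of `pad_to`.
    return ((vocab_size + pad_to - 1) // pad_to) * pad_to

vocab_size, tp_size = 32001, 2
padded = pad_vocab_size(vocab_size)   # 32064
per_partition = padded // tp_size     # 16032 rows expected on each rank
naive = vocab_size // tp_size         # 16000 rows materialized by the old code

assert naive != per_partition         # mismatch -> failure when loading with tp > 1
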
@@ -66,12 +66,20 @@ STARCODER_CONFIG = GGUFTestConfig(
     gguf_filename="starcoder2-3b.Q6_K.gguf",
 )
 
+DOLPHIN_CONFIG = GGUFTestConfig(
+    # Test VocabParallelEmbedding sharding issue.
+    original_model="cognitivecomputations/TinyDolphin-2.8-1.1b",
+    gguf_repo="tsunemoto/TinyDolphin-2.8-1.1b-GGUF",
+    gguf_filename="tinydolphin-2.8-1.1b.Q6_K.gguf",
+)
+
 MODELS = [
     LLAMA_CONFIG,
     QWEN2_CONFIG,
     PHI3_CONFIG,
     GPT2_CONFIG,
     STABLELM_CONFIG,
+    DOLPHIN_CONFIG
     # STARCODER_CONFIG, # broken
 ]
 
@@ -107,6 +115,7 @@ def test_models(
 
     # Run unquantized model.
     with vllm_runner(model_name=model.original_model,
+                     enforce_eager=True,  # faster tests
                      dtype=dtype,
                      max_model_len=MAX_MODEL_LEN,
                      tensor_parallel_size=tp_size) as original_model:
 
@@ -115,6 +124,7 @@ def test_models(
 
     # Run gguf model.
    with vllm_runner(model_name=model.gguf_model,
+                     enforce_eager=True,
                      tokenizer_name=model.original_model,
                      dtype=dtype,
                      max_model_len=MAX_MODEL_LEN,
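
The new DOLPHIN_CONFIG is there to exercise the VocabParallelEmbedding sharding path, presumably because TinyDolphin's vocabulary size is not a multiple of 64, which is the case the commit title describes. Outside the test suite, the failure mode this guards against can be reproduced roughly as follows; this is a hedged sketch, and the local GGUF path, prompt, and sampling settings are placeholders rather than part of the change:

# Hedged reproduction sketch, not part of the diff. Assumes the Q6_K GGUF file
# was downloaded locally; the path is a placeholder.
from vllm import LLM, SamplingParams

llm = LLM(
    model="./tinydolphin-2.8-1.1b.Q6_K.gguf",                # quantized weights
    tokenizer="cognitivecomputations/TinyDolphin-2.8-1.1b",  # original tokenizer
    tensor_parallel_size=2,   # tp > 1 is what hit the sharding bug
    enforce_eager=True,
    max_model_len=1024,
)
out = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(out[0].outputs[0].text)
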
@@ -355,7 +355,7 @@ class VocabParallelEmbedding(torch.nn.Module):
         elif isinstance(param, UninitializedParameter):
             shape = list(loaded_weight.shape)
             if output_dim is not None:
-                shape[output_dim] = shape[output_dim] // self.tp_size
+                shape[output_dim] = self.num_embeddings_per_partition
             param.materialize(tuple(shape), dtype=loaded_weight.dtype)
 
         # If parameter does not have output dim, then it should
 
@@ -381,7 +381,7 @@ class VocabParallelEmbedding(torch.nn.Module):
         else:
             assert loaded_weight.shape[output_dim] == self.org_vocab_size
 
-        # Copy the data.
+        # Copy the data. Select chunk corresponding to current shard.
         loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
 
         if current_platform.is_hpu():
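
The one-line change above makes the lazily materialized GGUF parameter match the shard layout the rest of the layer assumes: each rank owns an equal slice of the padded vocabulary, and rows that fall beyond the original vocabulary are padding. A rough sketch of that layout, again with illustrative numbers rather than vLLM's real shard-index helpers:

# Sketch of the per-rank shard layout the fix aligns with (illustrative only,
# not vLLM's real shard-index helpers). Numbers match the earlier example.

def shard_rows(padded_vocab: int, rank: int, tp_size: int) -> tuple[int, int]:
    # Each rank owns an equal, contiguous slice of the padded vocabulary.
    size = padded_vocab // tp_size
    return rank * size, size

vocab_size, padded_vocab, tp_size = 32001, 32064, 2
for rank in range(tp_size):
    start, size = shard_rows(padded_vocab, rank, tp_size)
    real = max(0, min(vocab_size, start + size) - start)  # rows backed by the checkpoint
    print(f"rank {rank}: rows [{start}, {start + size}), {real} real, {size - real} padding")
# rank 0: rows [0, 16032), 16032 real, 0 padding
# rank 1: rows [16032, 32064), 15969 real, 63 padding
# Materializing each shard as vocab_size // tp_size = 16000 rows could not even
# hold rank 0's 16032 checkpoint rows, hence the tp > 1 failure.
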