Convert before transpose (#1073)

parent f029ef94d7
commit cc796b1358
@@ -43,8 +43,8 @@ from vllm.model_executor.parallel_utils.tensor_parallel import (
     VocabParallelEmbedding)
 from vllm.model_executor.quantization_utils import QuantizationConfig
 from vllm.model_executor.weight_utils import (
-    load_tensor_parallel_weights, load_padded_tensor_parallel_vocab,
-    hf_model_weights_iterator)
+    convert_pyslice_to_tensor, hf_model_weights_iterator,
+    load_tensor_parallel_weights, load_padded_tensor_parallel_vocab)
 from vllm.sequence import SamplerOutput
 
 KVCache = Tuple[torch.Tensor, torch.Tensor]
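The substantive change in this hunk is the new convert_pyslice_to_tensor import, which the loading code in the next hunk uses to materialize lazily-loaded safetensors slices before operating on them. As a minimal sketch of what such a helper plausibly does (the isinstance check and the [:] materialization are assumptions based on the safetensors slice API, not code copied from vllm/model_executor/weight_utils.py):

import torch

def convert_pyslice_to_tensor(x):
    # A safetensors PySafeSlice supports indexing, so callers can read
    # just part of a tensor from disk, but it does not support tensor
    # operations such as .T or .view(). Indexing with [:] loads the
    # whole tensor into memory as a torch.Tensor.
    if not isinstance(x, torch.Tensor):
        x = x[:]
    return x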
@@ -337,6 +337,7 @@ class LlamaForCausalLM(nn.Module):
                 is_packed = self.quant_config.is_packed(name)
                 is_transposed = self.quant_config.is_transposed(name)
             if is_transposed:
+                loaded_weight = convert_pyslice_to_tensor(loaded_weight)
                 loaded_weight = loaded_weight.T
 
             is_attention_weight = False
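The added line is the whole fix, and it matches the commit title: when a quantized checkpoint stores a weight transposed, the loader previously called .T directly on whatever hf_model_weights_iterator yielded. For safetensors checkpoints that can be a lazy PySafeSlice, which supports indexing but not transposition, so the object must be converted to a real tensor before the transpose. A sketch of the failure mode, using a made-up file and tensor name (the PySafeSlice behavior follows the safetensors API):

from safetensors import safe_open

with safe_open("model.safetensors", framework="pt") as f:  # hypothetical file
    w = f.get_slice("layers.0.qkv_proj.qweight")  # hypothetical tensor name
    # w is a PySafeSlice: w[:] and w.get_shape() work, but w.T raises
    # AttributeError because w is not a torch.Tensor yet.
    w = w[:]  # what convert_pyslice_to_tensor effectively does
    w = w.T   # safe now: w is a torch.Tensor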