From cc796b13584419afc741d747c02cb715adb9c019 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 18 Sep 2023 11:51:48 -0700
Subject: [PATCH] Convert before transpose (#1073)

---
 vllm/model_executor/models/llama.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index e87f0073c520..0b7f4181a150 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -43,8 +43,8 @@ from vllm.model_executor.parallel_utils.tensor_parallel import (
     VocabParallelEmbedding)
 from vllm.model_executor.quantization_utils import QuantizationConfig
 from vllm.model_executor.weight_utils import (
-    load_tensor_parallel_weights, load_padded_tensor_parallel_vocab,
-    hf_model_weights_iterator)
+    convert_pyslice_to_tensor, hf_model_weights_iterator,
+    load_tensor_parallel_weights, load_padded_tensor_parallel_vocab)
 from vllm.sequence import SamplerOutput
 
 KVCache = Tuple[torch.Tensor, torch.Tensor]
@@ -337,6 +337,7 @@ class LlamaForCausalLM(nn.Module):
             is_packed = self.quant_config.is_packed(name)
             is_transposed = self.quant_config.is_transposed(name)
             if is_transposed:
+                loaded_weight = convert_pyslice_to_tensor(loaded_weight)
                 loaded_weight = loaded_weight.T
 
             is_attention_weight = False
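
The rationale, roughly: weights yielded by hf_model_weights_iterator can arrive as lazily-loaded safetensors slice objects, which do not support tensor operations such as .T, so they must be materialized into a torch.Tensor before transposing. Below is a minimal sketch of what convert_pyslice_to_tensor is assumed to do here (the actual body in vllm/model_executor/weight_utils.py may differ); it assumes the slice object supports [:] indexing to load the full tensor.

    import torch

    def convert_pyslice_to_tensor(x):
        # Sketch under the assumption above: a lazily-loaded safetensors
        # slice is materialized by indexing with [:], so that subsequent
        # tensor ops like .T work; real tensors pass through unchanged.
        if not isinstance(x, torch.Tensor):
            x = x[:]
        return x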