diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index 1e340162ddb5..773fed36a9fb 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -54,6 +54,7 @@ class GPTNeoXAttention(nn.Module):
         self.total_num_heads = config.num_attention_heads
         self.hidden_size = config.hidden_size
         self.head_size = self.hidden_size // self.total_num_heads
+        self.bias = getattr(config, "attention_bias", True)
 
         tensor_model_parallel_world_size = (
             get_tensor_model_parallel_world_size())
@@ -65,11 +66,13 @@ class GPTNeoXAttention(nn.Module):
             config.hidden_size,
             self.head_size,
             self.total_num_heads,
+            bias=self.bias,
             linear_method=linear_method,
         )
         self.dense = RowParallelLinear(
             config.hidden_size,
             config.hidden_size,
+            bias=self.bias,
             linear_method=linear_method,
         )
         scaling = self.head_size**-0.5
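
For reference, a minimal sketch (not part of the patch) of how the getattr fallback above behaves: configs that expose an attention_bias attribute control whether the QKV and dense projections get a bias term, while older configs without the attribute keep the previous default of bias-enabled layers. The SimpleNamespace stand-ins below are illustrative only; real code would pass a transformers GPTNeoXConfig.

    from types import SimpleNamespace

    # Hypothetical stand-ins for a HF config object.
    cfg_no_bias = SimpleNamespace(attention_bias=False)  # config disables bias
    cfg_legacy = SimpleNamespace()                        # older config, no flag

    # Mirrors the patch: a missing attribute falls back to True (bias enabled).
    assert getattr(cfg_no_bias, "attention_bias", True) is False
    assert getattr(cfg_legacy, "attention_bias", True) is True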