mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-20 14:17:04 +08:00
fix: typos (#18151)
Signed-off-by: omahs <73983677+omahs@users.noreply.github.com>
This commit is contained in:
parent
a8f5aec20a
commit
a9944aabfa
@ -172,7 +172,7 @@ __device__ void paged_attention_kernel(
|
|||||||
|
|
||||||
// Load the query to registers.
|
// Load the query to registers.
|
||||||
// Each thread in a thread group has a different part of the query.
|
// Each thread in a thread group has a different part of the query.
|
||||||
// For example, if the the thread group size is 4, then the first thread in
|
// For example, if the thread group size is 4, then the first thread in
|
||||||
// the group has 0, 4, 8, ... th vectors of the query, and the second thread
|
// the group has 0, 4, 8, ... th vectors of the query, and the second thread
|
||||||
// has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because
|
// has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because
|
||||||
// q is split from a qkv tensor, it may not be contiguous.
|
// q is split from a qkv tensor, it may not be contiguous.
|
||||||
@ -259,7 +259,7 @@ __device__ void paged_attention_kernel(
|
|||||||
|
|
||||||
// Load a key to registers.
|
// Load a key to registers.
|
||||||
// Each thread in a thread group has a different part of the key.
|
// Each thread in a thread group has a different part of the key.
|
||||||
// For example, if the the thread group size is 4, then the first thread in
|
// For example, if the thread group size is 4, then the first thread in
|
||||||
// the group has 0, 4, 8, ... th vectors of the key, and the second thread
|
// the group has 0, 4, 8, ... th vectors of the key, and the second thread
|
||||||
// has 1, 5, 9, ... th vectors of the key, and so on.
|
// has 1, 5, 9, ... th vectors of the key, and so on.
|
||||||
for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
|
for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
|
||||||
|
|||||||
@ -68,7 +68,7 @@ def get_current_weather(city: str, state: str, unit: 'str'):
|
|||||||
"partly cloudly, with highs in the 90's.")
|
"partly cloudly, with highs in the 90's.")
|
||||||
|
|
||||||
|
|
||||||
tool_funtions = {"get_current_weather": get_current_weather}
|
tool_functions = {"get_current_weather": get_current_weather}
|
||||||
|
|
||||||
tools = [{
|
tools = [{
|
||||||
"type": "function",
|
"type": "function",
|
||||||
@ -122,7 +122,7 @@ messages.append({
|
|||||||
# above defined function
|
# above defined function
|
||||||
tool_calls = json.loads(output)
|
tool_calls = json.loads(output)
|
||||||
tool_answers = [
|
tool_answers = [
|
||||||
tool_funtions[call['name']](**call['arguments']) for call in tool_calls
|
tool_functions[call['name']](**call['arguments']) for call in tool_calls
|
||||||
]
|
]
|
||||||
|
|
||||||
# append the answer as a tool message and let the LLM give you an answer
|
# append the answer as a tool message and let the LLM give you an answer
|
||||||
|
|||||||
@ -30,7 +30,7 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
|
|||||||
|
|
||||||
lora_path = get_adapter_absolute_path(lora_name)
|
lora_path = get_adapter_absolute_path(lora_name)
|
||||||
|
|
||||||
# lora loading should work for either absolute path and hugggingface id.
|
# lora loading should work for either an absolute path or a huggingface id.
|
||||||
peft_helper = PEFTHelper.from_local_dir(lora_path, 4096)
|
peft_helper = PEFTHelper.from_local_dir(lora_path, 4096)
|
||||||
lora_model = LoRAModel.from_local_checkpoint(
|
lora_model = LoRAModel.from_local_checkpoint(
|
||||||
lora_path,
|
lora_path,
|
||||||
|
|||||||
@ -20,11 +20,11 @@ def test_hf_transfer_auto_activation():
|
|||||||
try:
|
try:
|
||||||
# enable hf hub transfer if available
|
# enable hf hub transfer if available
|
||||||
import hf_transfer # type: ignore # noqa
|
import hf_transfer # type: ignore # noqa
|
||||||
HF_TRANFER_ACTIVE = True
|
HF_TRANSFER_ACTIVE = True
|
||||||
except ImportError:
|
except ImportError:
|
||||||
HF_TRANFER_ACTIVE = False
|
HF_TRANSFER_ACTIVE = False
|
||||||
assert (huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER ==
|
assert (huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER ==
|
||||||
HF_TRANFER_ACTIVE)
|
HF_TRANSFER_ACTIVE)
|
||||||
|
|
||||||
|
|
||||||
def test_download_weights_from_hf():
|
def test_download_weights_from_hf():
|
||||||
|
|||||||
@ -297,7 +297,7 @@ class ModelConfig:
|
|||||||
- 1K -> 1024\n
|
- 1K -> 1024\n
|
||||||
- 25.6k -> 25,600"""
|
- 25.6k -> 25,600"""
|
||||||
spec_target_max_model_len: Optional[int] = None
|
spec_target_max_model_len: Optional[int] = None
|
||||||
"""Specify the the maximum length for spec decoding draft models."""
|
"""Specify the maximum length for spec decoding draft models."""
|
||||||
quantization: Optional[QuantizationMethods] = None
|
quantization: Optional[QuantizationMethods] = None
|
||||||
"""Method used to quantize the weights. If `None`, we first check the
|
"""Method used to quantize the weights. If `None`, we first check the
|
||||||
`quantization_config` attribute in the model config file. If that is
|
`quantization_config` attribute in the model config file. If that is
|
||||||
|
|||||||
@ -153,7 +153,7 @@ def _lora_expand(
|
|||||||
lora_token_start_loc (torch.Tensor): A cumulative sum of
|
lora_token_start_loc (torch.Tensor): A cumulative sum of
|
||||||
num_tokens_per_lora. lora_token_start_loc[0] is always 0 so that
|
num_tokens_per_lora. lora_token_start_loc[0] is always 0 so that
|
||||||
lora_token_start_loc[i], along with num_tokens_per_lora[i]
|
lora_token_start_loc[i], along with num_tokens_per_lora[i]
|
||||||
identifies the the region in token_indices_sorted_by_lora_ids that
|
identifies the region in token_indices_sorted_by_lora_ids that
|
||||||
LoRA lora_ids[i] should process.
|
LoRA lora_ids[i] should process.
|
||||||
lora_ids (torch.Tensor): LoRA ids to process.
|
lora_ids (torch.Tensor): LoRA ids to process.
|
||||||
no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1, that indicates
|
no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1, that indicates
|
||||||
|
|||||||
@ -142,7 +142,7 @@ def mamba_v2_sharded_weight_loader(
|
|||||||
) -> LoaderFunction:
|
) -> LoaderFunction:
|
||||||
"""Create a weight loader for mamba v2. This ensures that the projections
|
"""Create a weight loader for mamba v2. This ensures that the projections
|
||||||
are correctly sharded so that they can be split into x, B, C. It also
|
are correctly sharded so that they can be split into x, B, C. It also
|
||||||
ensures the the all the groups corresponding to a head shard is placed
|
ensures that all the groups corresponding to a head shard are placed
|
||||||
together with it.
|
together with it.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|||||||
@ -21,7 +21,7 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Inference-only IBM Granite speeech model."""
|
"""Inference-only IBM Granite speech model."""
|
||||||
import math
|
import math
|
||||||
from collections.abc import Iterable, Mapping
|
from collections.abc import Iterable, Mapping
|
||||||
from typing import Optional, TypedDict, Union
|
from typing import Optional, TypedDict, Union
|
||||||
@ -626,7 +626,7 @@ class GraniteSpeechForConditionalGeneration(
|
|||||||
audio_embed_sizes: torch.Tensor,
|
audio_embed_sizes: torch.Tensor,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
"""Calculate the input features mask, which will generally be used
|
"""Calculate the input features mask, which will generally be used
|
||||||
to mask the the padded features for all entries in the batch except
|
to mask the padded features for all entries in the batch except
|
||||||
for those with the most audio features.
|
for those with the most audio features.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@ -91,9 +91,9 @@ class ConformerEncoderLayer(nn.Module):
|
|||||||
if set to True, use GLULinear module,
|
if set to True, use GLULinear module,
|
||||||
otherwise, use GLUPointWiseConv module.
|
otherwise, use GLUPointWiseConv module.
|
||||||
default to False.
|
default to False.
|
||||||
attention_innner_dim: int, optional
|
attention_inner_dim: int, optional
|
||||||
if equal to -1, attention dim for linears k/q/v is
|
if equal to -1, attention dim for linears k/q/v is
|
||||||
equal to d_model. otherwise attention_innner_dim is used.
|
equal to d_model. otherwise attention_inner_dim is used.
|
||||||
default -1.
|
default -1.
|
||||||
attention_glu_type: str, optional
|
attention_glu_type: str, optional
|
||||||
activation function for glu used in the multihead attention,
|
activation function for glu used in the multihead attention,
|
||||||
@ -148,7 +148,7 @@ class ConformerEncoderLayer(nn.Module):
|
|||||||
conv_glu_type="sigmoid",
|
conv_glu_type="sigmoid",
|
||||||
bias_in_glu=True,
|
bias_in_glu=True,
|
||||||
linear_glu_in_convm=False,
|
linear_glu_in_convm=False,
|
||||||
attention_innner_dim=-1,
|
attention_inner_dim=-1,
|
||||||
attention_glu_type="swish",
|
attention_glu_type="swish",
|
||||||
activation_checkpointing="",
|
activation_checkpointing="",
|
||||||
export=False,
|
export=False,
|
||||||
@ -169,7 +169,7 @@ class ConformerEncoderLayer(nn.Module):
|
|||||||
n_head,
|
n_head,
|
||||||
d_model,
|
d_model,
|
||||||
dropout_rate,
|
dropout_rate,
|
||||||
attention_innner_dim,
|
attention_inner_dim,
|
||||||
attention_glu_type,
|
attention_glu_type,
|
||||||
bias_in_glu,
|
bias_in_glu,
|
||||||
use_pt_scaled_dot_product_attention=
|
use_pt_scaled_dot_product_attention=
|
||||||
|
|||||||
@ -72,7 +72,7 @@ class Request:
|
|||||||
assert len(self.mm_inputs) == len(self.mm_hashes)
|
assert len(self.mm_inputs) == len(self.mm_hashes)
|
||||||
|
|
||||||
# Read-only views
|
# Read-only views
|
||||||
# Prevent directly appending to the these lists since
|
# Prevent directly appending to these lists since
|
||||||
# they should also be updated simultaneously.
|
# they should also be updated simultaneously.
|
||||||
self.output_token_ids = ConstantList(self._output_token_ids)
|
self.output_token_ids = ConstantList(self._output_token_ids)
|
||||||
self.all_token_ids = ConstantList(self._all_token_ids)
|
self.all_token_ids = ConstantList(self._all_token_ids)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user