mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-16 15:37:13 +08:00
fix: typos (#18151)
Signed-off-by: omahs <73983677+omahs@users.noreply.github.com>
This commit is contained in:
parent
a8f5aec20a
commit
a9944aabfa
@ -172,7 +172,7 @@ __device__ void paged_attention_kernel(
|
||||
|
||||
// Load the query to registers.
|
||||
// Each thread in a thread group has a different part of the query.
|
||||
// For example, if the the thread group size is 4, then the first thread in
|
||||
// For example, if the thread group size is 4, then the first thread in
|
||||
// the group has 0, 4, 8, ... th vectors of the query, and the second thread
|
||||
// has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because
|
||||
// q is split from a qkv tensor, it may not be contiguous.
|
||||
@ -259,7 +259,7 @@ __device__ void paged_attention_kernel(
|
||||
|
||||
// Load a key to registers.
|
||||
// Each thread in a thread group has a different part of the key.
|
||||
// For example, if the the thread group size is 4, then the first thread in
|
||||
// For example, if the thread group size is 4, then the first thread in
|
||||
// the group has 0, 4, 8, ... th vectors of the key, and the second thread
|
||||
// has 1, 5, 9, ... th vectors of the key, and so on.
|
||||
for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
|
||||
|
||||
@ -68,7 +68,7 @@ def get_current_weather(city: str, state: str, unit: 'str'):
|
||||
"partly cloudly, with highs in the 90's.")
|
||||
|
||||
|
||||
tool_funtions = {"get_current_weather": get_current_weather}
|
||||
tool_functions = {"get_current_weather": get_current_weather}
|
||||
|
||||
tools = [{
|
||||
"type": "function",
|
||||
@ -122,7 +122,7 @@ messages.append({
|
||||
# above defined function
|
||||
tool_calls = json.loads(output)
|
||||
tool_answers = [
|
||||
tool_funtions[call['name']](**call['arguments']) for call in tool_calls
|
||||
tool_functions[call['name']](**call['arguments']) for call in tool_calls
|
||||
]
|
||||
|
||||
# append the answer as a tool message and let the LLM give you an answer
|
||||
|
||||
@ -30,7 +30,7 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
|
||||
|
||||
lora_path = get_adapter_absolute_path(lora_name)
|
||||
|
||||
# lora loading should work for either absolute path and hugggingface id.
|
||||
# lora loading should work for either absolute path or huggingface id.
|
||||
peft_helper = PEFTHelper.from_local_dir(lora_path, 4096)
|
||||
lora_model = LoRAModel.from_local_checkpoint(
|
||||
lora_path,
|
||||
|
||||
@ -20,11 +20,11 @@ def test_hf_transfer_auto_activation():
|
||||
try:
|
||||
# enable hf hub transfer if available
|
||||
import hf_transfer # type: ignore # noqa
|
||||
HF_TRANFER_ACTIVE = True
|
||||
HF_TRANSFER_ACTIVE = True
|
||||
except ImportError:
|
||||
HF_TRANFER_ACTIVE = False
|
||||
HF_TRANSFER_ACTIVE = False
|
||||
assert (huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER ==
|
||||
HF_TRANFER_ACTIVE)
|
||||
HF_TRANSFER_ACTIVE)
|
||||
|
||||
|
||||
def test_download_weights_from_hf():
|
||||
|
||||
@ -297,7 +297,7 @@ class ModelConfig:
|
||||
- 1K -> 1024\n
|
||||
- 25.6k -> 25,600"""
|
||||
spec_target_max_model_len: Optional[int] = None
|
||||
"""Specify the the maximum length for spec decoding draft models."""
|
||||
"""Specify the maximum length for spec decoding draft models."""
|
||||
quantization: Optional[QuantizationMethods] = None
|
||||
"""Method used to quantize the weights. If `None`, we first check the
|
||||
`quantization_config` attribute in the model config file. If that is
|
||||
|
||||
@ -153,7 +153,7 @@ def _lora_expand(
|
||||
lora_token_start_loc (torch.Tensor): A cumulative sum of
|
||||
num_tokens_per_lora. lora_token_start_loc[0] is always 0 so that
|
||||
lora_token_start_loc[i], along with num_tokens_per_lora[i]
|
||||
identifies the the region in token_indices_sorted_by_lora_ids that
|
||||
identifies the region in token_indices_sorted_by_lora_ids that
|
||||
LoRA lora_ids[i] should process.
|
||||
lora_ids (torch.Tensor): LoRA ids to process.
|
||||
no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1, that indicates
|
||||
|
||||
@ -142,7 +142,7 @@ def mamba_v2_sharded_weight_loader(
|
||||
) -> LoaderFunction:
|
||||
"""Create a weight loader for mamba v2. This ensures that the projections
|
||||
are correctly sharded so that they can be split into x, B, C. It also
|
||||
ensures the the all the groups corresponding to a head shard is placed
|
||||
ensures that all the groups corresponding to a head shard are placed
|
||||
together with it.
|
||||
"""
|
||||
|
||||
|
||||
@ -21,7 +21,7 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only IBM Granite speeech model."""
|
||||
"""Inference-only IBM Granite speech model."""
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping
|
||||
from typing import Optional, TypedDict, Union
|
||||
@ -626,7 +626,7 @@ class GraniteSpeechForConditionalGeneration(
|
||||
audio_embed_sizes: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""Calculate the input features mask, which will generally be used
|
||||
to mask the the padded features for all entries in the batch except
|
||||
to mask the padded features for all entries in the batch except
|
||||
for those with the most audio features.
|
||||
|
||||
Args:
|
||||
|
||||
@ -91,9 +91,9 @@ class ConformerEncoderLayer(nn.Module):
|
||||
if set to True, use GLULinear module,
|
||||
otherwise, use GLUPointWiseConv module.
|
||||
default to False.
|
||||
attention_innner_dim: int, optional
|
||||
attention_inner_dim: int, optional
|
||||
if equal to -1, attention dim for linears k/q/v is
|
||||
equal to d_model. otherwise attention_innner_dim is used.
|
||||
equal to d_model. otherwise attention_inner_dim is used.
|
||||
default -1.
|
||||
attention_glu_type: str, optional
|
||||
activation function for glu used in the multihead attention,
|
||||
@ -148,7 +148,7 @@ class ConformerEncoderLayer(nn.Module):
|
||||
conv_glu_type="sigmoid",
|
||||
bias_in_glu=True,
|
||||
linear_glu_in_convm=False,
|
||||
attention_innner_dim=-1,
|
||||
attention_inner_dim=-1,
|
||||
attention_glu_type="swish",
|
||||
activation_checkpointing="",
|
||||
export=False,
|
||||
@ -169,7 +169,7 @@ class ConformerEncoderLayer(nn.Module):
|
||||
n_head,
|
||||
d_model,
|
||||
dropout_rate,
|
||||
attention_innner_dim,
|
||||
attention_inner_dim,
|
||||
attention_glu_type,
|
||||
bias_in_glu,
|
||||
use_pt_scaled_dot_product_attention=
|
||||
|
||||
@ -72,7 +72,7 @@ class Request:
|
||||
assert len(self.mm_inputs) == len(self.mm_hashes)
|
||||
|
||||
# Read-only views
|
||||
# Prevent directly appending to the these lists since
|
||||
# Prevent directly appending to these lists since
|
||||
# they should also be updated simultaneously.
|
||||
self.output_token_ids = ConstantList(self._output_token_ids)
|
||||
self.all_token_ids = ConstantList(self._all_token_ids)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user