mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-09 07:55:01 +08:00
[Doc]: fix typos in Python comments (#24115)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
This commit is contained in:
parent
d7e1e59972
commit
02d411fdb2
@ -218,7 +218,7 @@ if __name__ == "__main__":
|
||||
"--xaxis",
|
||||
type=str,
|
||||
default="# of max concurrency.",
|
||||
help="column name to use as X Axis in comparision graph",
|
||||
help="column name to use as X Axis in comparison graph",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
@ -1104,7 +1104,7 @@ def create_argument_parser():
|
||||
"--percentile-metrics",
|
||||
type=str,
|
||||
default="ttft,tpot,itl",
|
||||
help="Comma-separated list of selected metrics to report percentils. "
|
||||
help="Comma-separated list of selected metrics to report percentiles. "
|
||||
"This argument specifies the metrics to report percentiles. "
|
||||
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
|
||||
'Default value is "ttft,tpot,itl".',
|
||||
|
||||
@ -998,7 +998,7 @@ def create_argument_parser():
|
||||
"--percentile-metrics",
|
||||
type=str,
|
||||
default="ttft,tpot,itl",
|
||||
help="Comma-separated list of selected metrics to report percentils. "
|
||||
help="Comma-separated list of selected metrics to report percentiles. "
|
||||
"This argument specifies the metrics to report percentiles. "
|
||||
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
|
||||
'Default value is "ttft,tpot,itl".',
|
||||
|
||||
@ -719,7 +719,7 @@ def create_argument_parser():
|
||||
"[length * (1 - range_ratio), length * (1 + range_ratio)].",
|
||||
)
|
||||
|
||||
# hf dtaset
|
||||
# hf dataset
|
||||
parser.add_argument(
|
||||
"--hf-subset", type=str, default=None, help="Subset of the HF dataset."
|
||||
)
|
||||
|
||||
@ -119,7 +119,7 @@ def attempt_to_make_names_unique(entries_and_traces):
|
||||
if not all_the_same(trace_eles)), None)
|
||||
|
||||
if first_trace_difference is None:
|
||||
# can't create a unique name, leave them names as the
|
||||
# can't create a unique name, leave the names as they
|
||||
# are; they will get aggregated by the pivot_table call
|
||||
continue
|
||||
|
||||
|
||||
@ -513,7 +513,7 @@ if flashinfer_comm is not None:
|
||||
torch.ops._C.static_scaled_fp8_quant(
|
||||
quant_out, norm_out, scale_factor)
|
||||
if scale_factor is None or norm_out is not None:
|
||||
# we need to return allreduce outpput
|
||||
# we need to return allreduce output
|
||||
# in cases of non quant fused AR + RMS norm
|
||||
# and fused AR + RMS norm + quant without fused add
|
||||
allreduce_in.copy_(allreduce_out)
|
||||
|
||||
@ -49,7 +49,7 @@ class MQLLMEngine:
|
||||
|
||||
This class is used to wrap the
|
||||
[`LLMEngine`][vllm.engine.llm_engine.LLMEngine] class to enable use
|
||||
in concurrnet manner. It runs a background loop and uses zeromq to
|
||||
in a concurrent manner. It runs a background loop and uses zeromq to
|
||||
receive new requests and stream outputs incrementally via ipc.
|
||||
|
||||
The [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] generate or encode
|
||||
|
||||
@ -23,7 +23,7 @@ TORCH_DEVICE_IDENTITY = None
|
||||
# The condition to determine if it is on a platform that supports
|
||||
# torch._scaled_mm rowwise feature.
|
||||
# The condition is determined once as the operations
|
||||
# are time consuming.
|
||||
# are time-consuming.
|
||||
USE_ROWWISE_TORCH_SCALED_MM = (current_platform.is_rocm() and version.parse(
|
||||
torch.__version__) >= version.parse("2.7")
|
||||
and current_platform.has_device_capability(94))
|
||||
|
||||
@ -211,7 +211,7 @@ class DefaultModelLoader(BaseModelLoader):
|
||||
|
||||
if not USE_TPU_COMMONS:
|
||||
# In PyTorch XLA, we should call `xm.mark_step`
|
||||
# requently so that not too many ops are accumulated
|
||||
# frequently so that not too many ops are accumulated
|
||||
# in the XLA program. import torch_xla.core.xla_model
|
||||
# as xm
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
@ -84,7 +84,7 @@ class XPUWorker(Worker):
|
||||
"""Profiles the peak memory usage of the model to determine how many
|
||||
KV blocks may be allocated without OOMs.
|
||||
The engine will first conduct a profiling of the existing memory usage.
|
||||
Then, it calculate the maximum possible number of GPU and CPU blocks
|
||||
Then, it calculates the maximum possible number of GPU and CPU blocks
|
||||
that can be allocated with the remaining free memory.
|
||||
.. tip::
|
||||
You may limit the usage of GPU memory
|
||||
|
||||
@ -234,7 +234,7 @@ class Worker(LocalOrDistributedWorkerBase):
|
||||
KV blocks may be allocated without OOMs.
|
||||
|
||||
The engine will first conduct a profiling of the existing memory usage.
|
||||
Then, it calculate the maximum possible number of GPU and CPU blocks
|
||||
Then, it calculates the maximum possible number of GPU and CPU blocks
|
||||
that can be allocated with the remaining free memory.
|
||||
|
||||
Tip:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user