Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-11 00:25:49 +08:00)

[Doc]: fix typos in Python comments (#24115)

Signed-off-by: Didier Durand <durand.didier@gmail.com>

Parent commit: d7e1e59972
This commit: 02d411fdb2
```diff
@@ -218,7 +218,7 @@ if __name__ == "__main__":
         "--xaxis",
         type=str,
         default="# of max concurrency.",
-        help="column name to use as X Axis in comparision graph",
+        help="column name to use as X Axis in comparison graph",
     )
     args = parser.parse_args()
 
```
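For context, a minimal sketch of how an option like this is typically defined and consumed. The parser construction and the downstream plotting call are assumptions for illustration, not the benchmark script's actual code:

```python
import argparse

# Minimal sketch of the option shown in the hunk above; the surrounding
# parser setup is assumed, not copied from the actual script.
parser = argparse.ArgumentParser(description="plot benchmark comparisons")
parser.add_argument(
    "--xaxis",
    type=str,
    default="# of max concurrency.",
    help="column name to use as X Axis in comparison graph",
)
args = parser.parse_args([])  # empty list: use defaults instead of sys.argv

# The value would later select a DataFrame column, e.g. (hypothetical):
# df.plot(x=args.xaxis, y="p99_latency")
print(args.xaxis)
```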
```diff
@@ -1104,7 +1104,7 @@ def create_argument_parser():
         "--percentile-metrics",
         type=str,
         default="ttft,tpot,itl",
-        help="Comma-separated list of selected metrics to report percentils. "
+        help="Comma-separated list of selected metrics to report percentiles. "
         "This argument specifies the metrics to report percentiles. "
         'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
         'Default value is "ttft,tpot,itl".',
```
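A hedged sketch of how such a comma-separated metric list can be split, validated against the allowed names the help text gives ("ttft", "tpot", "itl", "e2el"), and used; the function name and sample data are illustrative, not vLLM's internals:

```python
import numpy as np

# Allowed names taken from the help text above.
ALLOWED = {"ttft", "tpot", "itl", "e2el"}

def parse_percentile_metrics(raw: str) -> list[str]:
    # Split on commas, ignore stray whitespace and empty entries.
    metrics = [m.strip() for m in raw.split(",") if m.strip()]
    unknown = set(metrics) - ALLOWED
    if unknown:
        raise ValueError(f"unknown metric names: {sorted(unknown)}")
    return metrics

# Illustrative latency samples (seconds).
samples = {"ttft": [0.12, 0.15, 0.31], "tpot": [0.02, 0.03, 0.05],
           "itl": [0.01, 0.02, 0.04]}
for metric in parse_percentile_metrics("ttft,tpot,itl"):
    p99 = np.percentile(samples[metric], 99)
    print(f"{metric} p99: {p99:.4f}s")
```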
```diff
@@ -998,7 +998,7 @@ def create_argument_parser():
         "--percentile-metrics",
         type=str,
         default="ttft,tpot,itl",
-        help="Comma-separated list of selected metrics to report percentils. "
+        help="Comma-separated list of selected metrics to report percentiles. "
         "This argument specifies the metrics to report percentiles. "
         'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
         'Default value is "ttft,tpot,itl".',
```
```diff
@@ -719,7 +719,7 @@ def create_argument_parser():
         "[length * (1 - range_ratio), length * (1 + range_ratio)].",
     )
 
-    # hf dtaset
+    # hf dataset
     parser.add_argument(
         "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
     )
```
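The help text above describes lengths drawn uniformly from [length * (1 - range_ratio), length * (1 + range_ratio)]. A small illustrative sketch of that sampling; the function name and rounding choice are assumptions:

```python
import random

def sample_length(length: int, range_ratio: float) -> int:
    # Uniform draw from [length * (1 - range_ratio), length * (1 + range_ratio)].
    low = int(length * (1 - range_ratio))
    high = int(length * (1 + range_ratio))
    return random.randint(low, high)

random.seed(0)
print([sample_length(1024, 0.25) for _ in range(5)])  # values in [768, 1280]
```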
```diff
@@ -119,7 +119,7 @@ def attempt_to_make_names_unique(entries_and_traces):
                 if not all_the_same(trace_eles)), None)
 
             if first_trace_difference is None:
-                # can't create a unique name, leave them names as the
+                # can't create a unique name, leave the names as they
                 # are they will get aggregated by the pivot_table call
                 continue
 
```
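The `next(..., None)` idiom above finds the first position at which a set of traces disagree. A self-contained reconstruction of that idiom; `all_the_same` and the `traces` data are illustrative stand-ins:

```python
def all_the_same(items) -> bool:
    # True when every element in the tuple is identical.
    return len(set(items)) <= 1

traces = [
    ("model", "layer1", "attn"),
    ("model", "layer2", "attn"),
]
# Index of the first position where the traces disagree, or None if they
# match everywhere (in which case no unique name can be built and the
# entries fall through to the pivot_table aggregation).
first_trace_difference = next(
    (i for i, trace_eles in enumerate(zip(*traces))
     if not all_the_same(trace_eles)),
    None,
)
print(first_trace_difference)  # -> 1
```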
```diff
@@ -513,7 +513,7 @@ if flashinfer_comm is not None:
             torch.ops._C.static_scaled_fp8_quant(
                 quant_out, norm_out, scale_factor)
         if scale_factor is None or norm_out is not None:
-            # we need to return allreduce outpput
+            # we need to return allreduce output
            # in cases of non quant fused AR + RMS norm
             # and fused AR + RMS norm + quant without fused add
             allreduce_in.copy_(allreduce_out)
```
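For readers unfamiliar with the quant step in this fused path: `torch.ops._C.static_scaled_fp8_quant` is a compiled vLLM kernel, so the following is only an approximation of its semantics in plain PyTorch (divide by a precomputed scale, clamp to the FP8 range, cast), not its actual code:

```python
import torch

def static_scaled_fp8_quant_ref(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Reference-only sketch of static (precomputed-scale) FP8 quantization.
    finfo = torch.finfo(torch.float8_e4m3fn)
    return (x / scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)

norm_out = torch.randn(4, 8)
scale = torch.tensor(0.05)
quant_out = static_scaled_fp8_quant_ref(norm_out, scale)
print(quant_out.dtype)  # torch.float8_e4m3fn
```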
```diff
@@ -49,7 +49,7 @@ class MQLLMEngine:
 
     This class is used to wrap the
     [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] class to enable use
-    in concurrnet manner. It runs a background loop and uses zeromq to
+    in concurrent manner. It runs a background loop and uses zeromq to
     receive new requests and stream outputs incrementally via ipc.
 
     The [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] generate or encode
```
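A minimal sketch of the zeromq request/stream pattern the docstring describes: requests arrive on one socket and outputs are streamed back incrementally on another. The socket topology, `inproc` transport, and message shape here are assumptions for illustration; the real MQLLMEngine protocol is more involved:

```python
import zmq

ctx = zmq.Context.instance()

# Engine side: receives requests, streams incremental outputs.
input_socket = ctx.socket(zmq.PULL)
input_socket.bind("inproc://requests")
output_socket = ctx.socket(zmq.PUSH)
output_socket.bind("inproc://outputs")

# Client side: submits a request and reads streamed chunks.
client_in = ctx.socket(zmq.PUSH)
client_in.connect("inproc://requests")
client_out = ctx.socket(zmq.PULL)
client_out.connect("inproc://outputs")

client_in.send_json({"request_id": "r1", "prompt": "hello"})
request = input_socket.recv_json()
for token in ["hi", " there"]:  # stand-in for incremental generation
    output_socket.send_json({"request_id": request["request_id"], "text": token})
print([client_out.recv_json()["text"] for _ in range(2)])
```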
```diff
@@ -23,7 +23,7 @@ TORCH_DEVICE_IDENTITY = None
 # The condition to determine if it is on a platform that supports
 # torch._scaled_mm rowwise feature.
 # The condition is determined once as the operations
-# are time consuming.
+# are time-consuming.
 USE_ROWWISE_TORCH_SCALED_MM = (current_platform.is_rocm() and version.parse(
     torch.__version__) >= version.parse("2.7")
                                and current_platform.has_device_capability(94))
```
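The point of the comment is that the gate is evaluated once at import time because the probing is time-consuming, then checked cheaply on the hot path. A sketch of that pattern; the CUDA stand-in check is an assumption, since `current_platform` is vLLM's own abstraction:

```python
import torch
from packaging import version

# Evaluated once at module import; probing versions/capabilities is slow.
_TORCH_AT_LEAST_2_7 = version.parse(torch.__version__) >= version.parse("2.7")
USE_ROWWISE_SCALED_MM = _TORCH_AT_LEAST_2_7 and torch.cuda.is_available()

def scaled_mm_dispatch() -> str:
    # Cheap flag check on the hot path; the expensive probing already ran.
    return "rowwise torch._scaled_mm" if USE_ROWWISE_SCALED_MM else "fallback"

print(scaled_mm_dispatch())
```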
```diff
@@ -211,7 +211,7 @@ class DefaultModelLoader(BaseModelLoader):
 
         if not USE_TPU_COMMONS:
             # In PyTorch XLA, we should call `xm.mark_step`
-            # requently so that not too many ops are accumulated
+            # frequently so that not too many ops are accumulated
             # in the XLA program. import torch_xla.core.xla_model
             # as xm
             import torch_xla.core.xla_model as xm
```
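A hedged sketch of the comment's advice: on PyTorch XLA, flush the accumulated ops periodically while loading weights so the lazily-traced graph does not grow unboundedly. Requires `torch_xla`; the loop body, interval, and weight iterator are illustrative, not the loader's actual code:

```python
import torch_xla.core.xla_model as xm

def load_weights(model_params, weight_iter):
    for i, (name, tensor) in enumerate(weight_iter):
        model_params[name].copy_(tensor)
        if i % 16 == 0:
            # Cut the XLA graph here so ops don't pile up in one program.
            xm.mark_step()
```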
```diff
@@ -84,7 +84,7 @@ class XPUWorker(Worker):
         """Profiles the peak memory usage of the model to determine how many
         KV blocks may be allocated without OOMs.
         The engine will first conduct a profiling of the existing memory usage.
-        Then, it calculate the maximum possible number of GPU and CPU blocks
+        Then, it calculates the maximum possible number of GPU and CPU blocks
         that can be allocated with the remaining free memory.
         .. tip::
             You may limit the usage of GPU memory
```
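A back-of-the-envelope version of the arithmetic this docstring describes: measure memory usage, reserve a fraction of total memory, and divide what remains by the per-block KV cache footprint. All model-shape numbers below are illustrative assumptions, not vLLM's profiling code:

```python
import torch

def kv_block_bytes(block_size=16, num_layers=32, num_kv_heads=8,
                   head_dim=128, dtype_bytes=2) -> int:
    # 2x for the separate key and value caches.
    return 2 * block_size * num_layers * num_kv_heads * head_dim * dtype_bytes

def estimate_num_gpu_blocks(gpu_memory_utilization: float = 0.9) -> int:
    if torch.cuda.is_available():
        free_bytes, total_bytes = torch.cuda.mem_get_info()
    else:
        free_bytes, total_bytes = 16 * 2**30, 24 * 2**30  # stub for CPU-only runs
    # Budget a fraction of total memory, subtract what is already in use.
    usable = total_bytes * gpu_memory_utilization - (total_bytes - free_bytes)
    return max(0, int(usable // kv_block_bytes()))

print(estimate_num_gpu_blocks())
```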
```diff
@@ -234,7 +234,7 @@ class Worker(LocalOrDistributedWorkerBase):
         KV blocks may be allocated without OOMs.
 
         The engine will first conduct a profiling of the existing memory usage.
-        Then, it calculate the maximum possible number of GPU and CPU blocks
+        Then, it calculates the maximum possible number of GPU and CPU blocks
         that can be allocated with the remaining free memory.
 
         Tip:
```