mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-06 21:24:34 +08:00
[Chore] Fix torch precision warning (#30428)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
parent
7e24e5d4d6
commit
d6464f2679
@@ -152,8 +152,8 @@ def run_tests(
|
|||||||
m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA")
|
m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA")
|
||||||
else:
|
else:
|
||||||
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
|
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
|
||||||
# lock matmul precision to full FP32
|
# lock matmul precision to full FP32 (IEEE)
|
||||||
m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
|
m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "ieee")
|
||||||
# m.setenv("VLLM_BATCH_INVARIANT", "1")
|
# m.setenv("VLLM_BATCH_INVARIANT", "1")
|
||||||
outputs: list[tuple[str, list, list]] = []
|
outputs: list[tuple[str, list, list]] = []
|
||||||
for n, (
|
for n, (
|
||||||
|
|||||||
10
vllm/envs.py
10
vllm/envs.py
@@ -74,7 +74,7 @@ if TYPE_CHECKING:
|
|||||||
VLLM_MEDIA_CONNECTOR: str = "http"
|
VLLM_MEDIA_CONNECTOR: str = "http"
|
||||||
VLLM_TARGET_DEVICE: str = "cuda"
|
VLLM_TARGET_DEVICE: str = "cuda"
|
||||||
VLLM_MAIN_CUDA_VERSION: str = "12.9"
|
VLLM_MAIN_CUDA_VERSION: str = "12.9"
|
||||||
VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest"
|
VLLM_FLOAT32_MATMUL_PRECISION: Literal["ieee", "tf32"] = "ieee"
|
||||||
MAX_JOBS: str | None = None
|
MAX_JOBS: str | None = None
|
||||||
NVCC_THREADS: str | None = None
|
NVCC_THREADS: str | None = None
|
||||||
VLLM_USE_PRECOMPILED: bool = False
|
VLLM_USE_PRECOMPILED: bool = False
|
||||||
@@ -456,11 +456,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
"VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower()
|
"VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower()
|
||||||
or "12.9",
|
or "12.9",
|
||||||
# Controls PyTorch float32 matmul precision mode within vLLM workers.
|
# Controls PyTorch float32 matmul precision mode within vLLM workers.
|
||||||
# Valid options mirror torch.set_float32_matmul_precision
|
# Accepted values:
|
||||||
|
# - "ieee" (default): force full IEEE FP32 matmul precision.
|
||||||
|
# - "tf32": enable TensorFloat32-based fast matmul.
|
||||||
"VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices(
|
"VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices(
|
||||||
"VLLM_FLOAT32_MATMUL_PRECISION",
|
"VLLM_FLOAT32_MATMUL_PRECISION",
|
||||||
"highest",
|
"ieee",
|
||||||
["highest", "high", "medium"],
|
["ieee", "tf32"],
|
||||||
case_sensitive=False,
|
case_sensitive=False,
|
||||||
),
|
),
|
||||||
# Maximum number of compilation jobs to run in parallel.
|
# Maximum number of compilation jobs to run in parallel.
|
||||||
|
|||||||
@@ -81,7 +81,7 @@ class Worker(WorkerBase):
|
|||||||
|
|
||||||
# configure float32 matmul precision according to vLLM env.
|
# configure float32 matmul precision according to vLLM env.
|
||||||
precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
|
precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
|
||||||
torch.set_float32_matmul_precision(precision)
|
torch.backends.cuda.matmul.fp32_precision = precision
|
||||||
|
|
||||||
if self.model_config.trust_remote_code:
|
if self.model_config.trust_remote_code:
|
||||||
# note: lazy import to avoid importing torch before initializing
|
# note: lazy import to avoid importing torch before initializing
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user