diff --git a/vllm/envs.py b/vllm/envs.py
index 1104f108784f6..8be9ebb95dded 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -360,8 +360,9 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     # Enables weights compression during model export via HF Optimum
     # default is False
     "VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS":
-    lambda: bool(os.getenv("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False)),
-
+    lambda:
+    (os.environ.get("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", "0").lower() in
+     ("on", "true", "1")),
     # If the env var is set, then all workers will execute as separate
     # processes from the engine, and we use the same mechanism to trigger
     # execution on all workers.
diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py
index fde200d576e2f..805f0cfc585e3 100644
--- a/vllm/model_executor/model_loader/openvino.py
+++ b/vllm/model_executor/model_loader/openvino.py
@@ -125,7 +125,8 @@ class OpenVINOCausalLM(nn.Module):
                 "as-is, all possible options that may affect model conversion "
                 "are ignored.")
 
-        load_in_8bit = envs.VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS
+        load_in_8bit = (envs.VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS
+                        if export else False)
         pt_model = OVModelForCausalLM.from_pretrained(
             model_config.model,
             export=export,