diff --git a/tests/conftest.py b/tests/conftest.py
index 74219e40026c..46b8dd1e1df1 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -57,6 +57,55 @@ MODELS_ON_S3 = [
     "ArthurZ/Ilama-3.2-1B",
     "llava-hf/llava-1.5-7b-hf",
     "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "JackFram/llama-160m",
+    "ai21labs/Jamba-tiny-random",
+    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
+    "nm-testing/Phi-3-mini-128k-instruct-FP8",
+    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
+    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
+    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
+    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
+    "AMead10/Llama-3.2-1B-Instruct-AWQ",
+    "shuyuej/Llama-3.2-1B-Instruct-GPTQ",
+    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
+    "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
+    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
+    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
+    "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
+    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
+    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
+    "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
+    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
+    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
+    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
+    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
+    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
+    "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
+    "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
+    "nm-testing/tinyllama-oneshot-w8a16-per-channel",
+    "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
+    "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
+    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
+    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
 ]
 
 MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index ac1be383c15b..18f6f40b32f0 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -27,8 +27,6 @@ from vllm.model_executor.layers.quantization import (QuantizationConfig,
 from vllm.platforms import current_platform
 from vllm.utils import PlaceholderModule
 
-logger = init_logger(__name__)
-
 try:
     from runai_model_streamer import SafetensorsStreamer
 except (ImportError, OSError):
@@ -39,6 +37,8 @@ except (ImportError, OSError):
     SafetensorsStreamer = runai_model_streamer.placeholder_attr(
         "SafetensorsStreamer")
 
+logger = init_logger(__name__)
+
 # use system-level temp directory for file locks, so that multiple users
 # can share the same lock without error.
 # lock files in the temp directory will be automatically deleted when the