From b735255f1743aaa18b298ea5ef68689115d43930 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Thu, 18 Dec 2025 16:22:30 -0800 Subject: [PATCH] improve cpu tests for 0.12.0 Signed-off-by: Tsai, Louie --- .../tests/serving-tests-cpu.json | 97 +++++++++++++++---- 1 file changed, 76 insertions(+), 21 deletions(-) diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json index 8f7200862d20c..1b031a2717610 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json @@ -19,10 +19,8 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "enforce_eager": "", "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" + "max_num_seqs": 256 }, "client_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", @@ -35,7 +33,8 @@ { "test_name": "serving_llama8B_tp1_sharegpt", "server_parameters": { - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "sharegpt", @@ -45,7 +44,8 @@ { "test_name": "serving_llama8B_tp2_sharegpt", "server_parameters": { - "tensor_parallel_size": 2 + "tensor_parallel_size": 2, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "sharegpt", @@ -55,7 +55,8 @@ { "test_name": "serving_llama8B_tp1_random_128_128", "server_parameters": { - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "random", @@ -66,7 +67,8 @@ { "test_name": "serving_llama8B_tp2_random_128_128", "server_parameters": { - "tensor_parallel_size": 2 + "tensor_parallel_size": 2, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "random", @@ -77,7 +79,8 @@ { "test_name": "serving_llama8B_tp4_random_128_128", "server_parameters": { - "tensor_parallel_size": 4 + "tensor_parallel_size": 4, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "random", @@ -88,7 +91,8 @@ { "test_name": "serving_llama8B_tp1_random_128_2048", "server_parameters": { - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "random", @@ -99,7 +103,8 @@ { "test_name": "serving_llama8B_tp2_random_128_2048", "server_parameters": { - "tensor_parallel_size": 2 + "tensor_parallel_size": 2, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "random", @@ -110,7 +115,8 @@ { "test_name": "serving_llama8B_tp4_random_128_2048", "server_parameters": { - "tensor_parallel_size": 4 + "tensor_parallel_size": 4, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "random", @@ -121,7 +127,8 @@ { "test_name": "serving_llama8B_tp1_random_2048_128", "server_parameters": { - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "random", @@ -132,7 +139,8 @@ { "test_name": "serving_llama8B_tp2_random_2048_128", "server_parameters": { - "tensor_parallel_size": 2 + "tensor_parallel_size": 2, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "random", @@ -143,7 +151,8 @@ { "test_name": "serving_llama8B_tp4_random_2048_128", "server_parameters": { - "tensor_parallel_size": 4 + "tensor_parallel_size": 4, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "random", @@ -151,11 +160,51 @@ "random-output-len": 128 } }, + { + "test_name": "serving_llama8B_int4_tp1_random_128_128", + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_int4_tp2_random_128_128", + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "tensor_parallel_size": 2 + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_int4_tp4_random_128_128", + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "tensor_parallel_size": 4 + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, { "test_name": "serving_llama3B_tp1_random_128_128", "server_parameters": { "model": "meta-llama/Llama-3.2-3B-Instruct", - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "model": "meta-llama/Llama-3.2-3B-Instruct", @@ -168,7 +217,8 @@ "test_name": "serving_granite2B_tp1_random_128_128", "server_parameters": { "model": "ibm-granite/granite-3.2-2b-instruct", - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "model": "ibm-granite/granite-3.2-2b-instruct", @@ -181,7 +231,8 @@ "test_name": "serving_qwen1.7B_tp1_random_128_128", "server_parameters": { "model": "Qwen/Qwen3-1.7B", - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "model": "Qwen/Qwen3-1.7B", @@ -194,7 +245,8 @@ "test_name": "serving_qwen4B_tp1_random_128_128", "server_parameters": { "model": "Qwen/Qwen3-4B", - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "model": "Qwen/Qwen3-4B", @@ -207,7 +259,8 @@ "test_name": "serving_qwen8B_tp1_random_128_128", "server_parameters": { "model": "Qwen/Qwen3-8B", - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "model": "Qwen/Qwen3-8B", @@ -220,7 +273,8 @@ "test_name": "serving_glm9B_tp1_random_128_128", "server_parameters": { "model": "zai-org/glm-4-9b-hf", - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "model": "zai-org/glm-4-9b-hf", @@ -233,7 +287,8 @@ "test_name": "serving_gemma7B_tp1_random_128_128", "server_parameters": { "model": "google/gemma-7b", - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "model": "google/gemma-7b",