From 1c1bb0bbf20955d346f66bb25d349c1bd9fe6ea2 Mon Sep 17 00:00:00 2001
From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com>
Date: Wed, 29 Jan 2025 18:47:30 -0600
Subject: [PATCH] [Misc][MoE] add Deepseek-V3 moe tuning support (#12558)

Signed-off-by: Divakar Verma
---
 benchmarks/kernels/benchmark_moe.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 1fa0da75c79d..5c8bf33afebc 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -450,7 +450,8 @@ def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
 def main(args: argparse.Namespace):
     print(args)
 
-    config = AutoConfig.from_pretrained(args.model)
+    config = AutoConfig.from_pretrained(
+        args.model, trust_remote_code=args.trust_remote_code)
     if config.architectures[0] == "DbrxForCausalLM":
         E = config.ffn_config.moe_num_experts
         topk = config.ffn_config.moe_top_k
@@ -461,6 +462,11 @@ def main(args: argparse.Namespace):
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] == "DeepseekV3ForCausalLM":
+        E = config.n_routed_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
     else:
         # Default: Mixtral.
         E = config.num_local_experts
@@ -538,6 +544,7 @@ if __name__ == "__main__":
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument("--batch-size", type=int, required=False)
     parser.add_argument("--tune", action="store_true")
+    parser.add_argument("--trust-remote-code", action="store_true")
     args = parser.parse_args()
 
     main(args)
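
A minimal standalone sketch of what the new DeepseekV3ForCausalLM branch resolves, for readers who want to check the derived MoE shapes outside the benchmark. The model id "deepseek-ai/DeepSeek-V3" and the tp_size value are illustrative assumptions, not part of the patch; the config field names and the shard computation mirror the diff above.

    # Sketch only: resolve MoE tuning shapes the way the new branch does.
    # Assumptions: model id and tensor-parallel size below are examples.
    from transformers import AutoConfig

    config = AutoConfig.from_pretrained(
        "deepseek-ai/DeepSeek-V3",   # assumed model id for illustration
        trust_remote_code=True)      # mirrors the new --trust-remote-code flag
    tp_size = 8                      # assumed tensor-parallel size

    E = config.n_routed_experts                 # number of routed experts
    topk = config.num_experts_per_tok           # experts selected per token
    intermediate_size = config.moe_intermediate_size
    shard_intermediate_size = 2 * intermediate_size // tp_size

    print(E, topk, intermediate_size, shard_intermediate_size)

With the patch applied, tuning a Deepseek-V3 checkpoint would be invoked with the existing script flags plus the new one, e.g. passing --trust-remote-code alongside --model, the tensor-parallel size flag, and --tune (exact names of the pre-existing flags are defined elsewhere in benchmark_moe.py).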