[Kernel] changing fused moe kernel chunk size default to 32k (#7995)

This commit is contained in:
Avshalom Manevich 2024-08-30 11:11:39 +07:00 committed by GitHub
parent 80c7b089b1
commit 34a0e96d46
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -352,7 +352,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
os.path.join(get_default_cache_root(), "vllm", "xla_cache"),
)),
"VLLM_FUSED_MOE_CHUNK_SIZE":
-lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "65536")),
+lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")),
# If set, vllm will skip the deprecation warnings.
"VLLM_NO_DEPRECATION_WARNING":