mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-03 19:37:57 +08:00
[V1] TPU - Revert to exponential padding by default (#15565)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
This commit is contained in:
parent
dd8a29da99
commit
b2e85e26f4
@ -99,7 +99,7 @@ if TYPE_CHECKING:
|
|||||||
VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
|
VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
|
||||||
VLLM_V0_USE_OUTLINES_CACHE: bool = False
|
VLLM_V0_USE_OUTLINES_CACHE: bool = False
|
||||||
VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION: bool = False
|
VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION: bool = False
|
||||||
VLLM_TPU_BUCKET_PADDING_GAP: int = 64
|
VLLM_TPU_BUCKET_PADDING_GAP: int = 0
|
||||||
|
|
||||||
|
|
||||||
def get_default_cache_root():
|
def get_default_cache_root():
|
||||||
@ -648,7 +648,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
# 8, we will run forward pass with [16, 24, 32, ...].
|
# 8, we will run forward pass with [16, 24, 32, ...].
|
||||||
"VLLM_TPU_BUCKET_PADDING_GAP":
|
"VLLM_TPU_BUCKET_PADDING_GAP":
|
||||||
lambda: int(os.environ["VLLM_TPU_BUCKET_PADDING_GAP"])
|
lambda: int(os.environ["VLLM_TPU_BUCKET_PADDING_GAP"])
|
||||||
if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ else 64,
|
if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ else 0,
|
||||||
}
|
}
|
||||||
|
|
||||||
# end-env-vars-definition
|
# end-env-vars-definition
|
||||||
|
|||||||
def _get_paddings(min_token_size: int, max_token_size: int,
                  padding_gap: int) -> list[int]:
    """Generate the list of padding sizes used to bucket token counts.

    The list starts at ``min_token_size`` and ends with a value that is
    >= ``max_token_size``, so every request size can be padded up to a
    precompiled bucket.

    If ``padding_gap == 0``:
        grow exponentially (double each step).
    Otherwise:
        double until ``padding_gap`` is reached, then grow linearly in
        steps of ``padding_gap``.

    Args:
        min_token_size: Smallest (first) padding value.
        max_token_size: Largest token count that must be covered.
        padding_gap: Linear step size, or 0 for pure exponential growth.

    Returns:
        Increasing list of padding sizes whose last element covers
        ``max_token_size``.
    """
    paddings: list[int] = []
    num = min_token_size

    if padding_gap == 0:
        logger.info("Using exponential paddings:")
        # Bug fix: the previous `while num <= max_token_size` loop could
        # terminate with a last padding smaller than max_token_size when
        # max_token_size is not min_token_size * 2**k, leaving the largest
        # sizes uncovered. Loop strictly below the limit and always append
        # one final value >= max_token_size; for exact power-of-two
        # coverage the output is unchanged.
        while num < max_token_size:
            logger.info("    %d", num)
            paddings.append(num)
            num *= 2
        logger.info("    %d", num)
        paddings.append(num)
    else:
        logger.info("Using incremental paddings:")
        # Exponential warm-up until the linear step size is reached.
        while num <= padding_gap:
            logger.info("    %d", num)
            paddings.append(num)
            num *= 2
        # Step back to the last appended value, then grow linearly past
        # max_token_size in increments of padding_gap.
        num //= 2
        while num < max_token_size:
            num += padding_gap
            logger.info("    %d", num)
            paddings.append(num)

    return paddings
|
|||||||
Loading…
x
Reference in New Issue
Block a user