mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-01 04:27:03 +08:00
Clean code
Signed-off-by: Yu Gong <yu3.gong@gmail.com>
This commit is contained in:
parent
ea1a26d276
commit
aa7917aaaa
@ -109,14 +109,6 @@ class CudagraphDispatcher:
|
||||
|
||||
max_loras = self.vllm_config.lora_config.max_loras
|
||||
|
||||
# When speculative decoding is enabled, only capture with max_loras
|
||||
# to avoid torch.compile conflicts during CUDA graph capture
|
||||
if self.vllm_config.speculative_config is not None:
|
||||
lora_cases = [(True, max_loras)]
|
||||
if self.compilation_config.cudagraph_specialize_lora:
|
||||
lora_cases.append((False, 0))
|
||||
return lora_cases
|
||||
|
||||
# Capture for each num_active_loras from 1 to max_loras
|
||||
lora_cases = [(True, n) for n in range(1, max_loras + 1)]
|
||||
# Also capture the no-lora case
|
||||
|
||||
@ -4598,24 +4598,6 @@ class GPUModelRunner(
|
||||
self.encoder_cache.clear()
|
||||
gc.collect()
|
||||
|
||||
def _get_lora_capture_cases(self) -> list[tuple[bool, int]]:
|
||||
"""
|
||||
Returns list of (has_lora, num_active_loras) tuples for CUDA graph capture.
|
||||
|
||||
Returns cases for each num_active_loras from 1 to max_loras.
|
||||
If cudagraph_specialize_lora is True, also includes the no-lora case.
|
||||
"""
|
||||
if not self.lora_config:
|
||||
return [(False, 0)]
|
||||
|
||||
max_loras = self.lora_config.max_loras
|
||||
# Capture for each num_active_loras from 1 to max_loras
|
||||
lora_cases = [(True, n) for n in range(1, max_loras + 1)]
|
||||
# Also capture the no-lora case if cudagraph_specialize_lora is True
|
||||
if self.compilation_config.cudagraph_specialize_lora:
|
||||
lora_cases.append((False, 0))
|
||||
return lora_cases
|
||||
|
||||
def capture_model(self) -> int:
|
||||
if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
|
||||
logger.warning(
|
||||
@ -4654,7 +4636,7 @@ class GPUModelRunner(
|
||||
assert cudagraph_mode is not None
|
||||
|
||||
# Build LoRA cases: list of (has_lora, num_active_loras) tuples
|
||||
lora_cases = self._get_lora_capture_cases()
|
||||
lora_cases = self.cudagraph_dispatcher._get_lora_cases()
|
||||
|
||||
if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE:
|
||||
cudagraph_runtime_mode = cudagraph_mode.mixed_mode()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user