disable inductor, disable piecewise

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
Tyler Michael Smith 2025-01-03 15:05:52 -05:00
parent 4024253797
commit 22bd7296e4
2 changed files with 4 additions and 8 deletions

View File

@@ -3174,11 +3174,13 @@ class VllmConfig:
# and avoid any potential issues with the inductor.
self.compilation_config.custom_ops = ["none"]
self.compilation_config.use_cudagraph = True
self.compilation_config.use_inductor = True
# self.compilation_config.use_inductor = True
self.compilation_config.use_inductor = False
self.compilation_config.cudagraph_num_of_warmups = 1
self.compilation_config.pass_config.enable_fusion = False
self.compilation_config.pass_config.enable_reshape = False
self.compilation_config.level = CompilationLevel.PIECEWISE
# self.compilation_config.level = CompilationLevel.PIECEWISE
self.compilation_config.level = CompilationLevel.NO_COMPILATION
self._set_cudagraph_sizes()

View File

@@ -840,12 +840,6 @@ class GPUModelRunner:
gc.collect()
def capture_model(self) -> None:
if not self.use_cuda_graph:
logger.warning(
"Skipping CUDA graph capture. Please add "
"-O %s to use CUDA graphs.", CompilationLevel.PIECEWISE)
return
start_time = time.perf_counter()
start_free_gpu_memory = torch.cuda.mem_get_info()[0]