diff --git a/cogvideox_fun/pipeline_cogvideox_inpaint.py b/cogvideox_fun/pipeline_cogvideox_inpaint.py
index 7b9d8e7..a6f0e9e 100644
--- a/cogvideox_fun/pipeline_cogvideox_inpaint.py
+++ b/cogvideox_fun/pipeline_cogvideox_inpaint.py
@@ -739,6 +739,8 @@ class CogVideoX_Fun_Pipeline_Inpaint(DiffusionPipeline):
         num_channels_transformer = self.transformer.config.in_channels
         return_image_latents = num_channels_transformer == num_channels_latents
 
+        self.vae.to(device)
+
         latents_outputs = self.prepare_latents(
             batch_size * num_videos_per_prompt,
             num_channels_latents,
@@ -840,6 +842,9 @@ class CogVideoX_Fun_Pipeline_Inpaint(DiffusionPipeline):
                 mask = rearrange(mask, "b c f h w -> b f c h w")
 
             inpaint_latents = None
+
+        self.vae.to(torch.device("cpu"))
+
         if comfyui_progressbar:
             pbar.update(1)
 
diff --git a/model_loading.py b/model_loading.py
index d1482d3..e627351 100644
--- a/model_loading.py
+++ b/model_loading.py
@@ -121,7 +121,7 @@ class DownloadAndLoadCogVideoModel:
                 "precision": (["fp16", "fp32", "bf16"],
                     {"default": "bf16", "tooltip": "official recommendation is that 2b model should be fp16, 5b model should be bf16"}
                 ),
-                "fp8_transformer": (['disabled', 'enabled', 'fastmode', 'torchao_fp8dq', "torchao_fp6"], {"default": 'disabled', "tooltip": "enabled casts the transformer to torch.float8_e4m3fn, fastmode is only for latest nvidia GPUs and requires torch 2.4.0 and cu124 minimum"}),
+                "fp8_transformer": (['disabled', 'enabled', 'fastmode', 'torchao_fp8dq', "torchao_fp8dqrow", "torchao_int8dq", "torchao_fp6"], {"default": 'disabled', "tooltip": "enabled casts the transformer to torch.float8_e4m3fn, fastmode is only for latest nvidia GPUs and requires torch 2.4.0 and cu124 minimum"}),
                 "compile": (["disabled","onediff","torch"], {"tooltip": "compile the model for faster inference, these are advanced options only available on Linux, see readme for more info"}),
                 "enable_sequential_cpu_offload": ("BOOLEAN", {"default": False, "tooltip": "significantly reducing memory usage and slows down the inference"}),
                 "block_edit": ("TRANSFORMERBLOCKS", {"default": None}),
@@ -301,7 +301,8 @@ class DownloadAndLoadCogVideoModel:
                 from torchao.quantization import (
                     quantize_,
                     fpx_weight_only,
-                    float8_dynamic_activation_float8_weight
+                    float8_dynamic_activation_float8_weight,
+                    int8_dynamic_activation_int8_weight
                 )
             except:
                 raise ImportError("torchao is not installed, please install torchao to use fp8dq")
@@ -316,11 +317,16 @@ class DownloadAndLoadCogVideoModel:
                 quant_func = fpx_weight_only(3, 2)
             elif "fp8dq" in fp8_transformer: #very fast on 4090 when compiled
                 quant_func = float8_dynamic_activation_float8_weight()
+            elif 'fp8dqrow' in fp8_transformer:
+                from torchao.quantization.quant_api import PerRow
+                quant_func = float8_dynamic_activation_float8_weight(granularity=PerRow())
+            elif 'int8dq' in fp8_transformer:
+                quant_func = int8_dynamic_activation_int8_weight()
 
             for i, block in enumerate(pipe.transformer.transformer_blocks):
                 if "CogVideoXBlock" in str(block):
                     quantize_(block, quant_func, filter_fn=filter_fn)
-
+            manual_offloading = False # to disable manual .to(device) calls
 
         if enable_sequential_cpu_offload:
 
diff --git a/nodes.py b/nodes.py
index dd9589a..b18a978 100644
--- a/nodes.py
+++ b/nodes.py
@@ -1100,10 +1100,6 @@ class CogVideoXFunSampler:
         base_path = pipeline["base_path"]
 
         assert "fun" in base_path.lower(), "'Unfun' models not supported in 'CogVideoXFunSampler', use the 'CogVideoSampler'"
         assert "pose" not in base_path.lower(), "'Pose' models not supported in 'CogVideoXFunSampler', use the 'CogVideoXFunControlSampler'"
-
-
-        if not pipeline["cpu_offloading"]:
-            pipe.enable_model_cpu_offload(device=device)
 
         mm.soft_empty_cache()
@@ -1123,8 +1119,8 @@ class CogVideoXFunSampler:
         else:
             raise ValueError(f"Unknown scheduler: {scheduler}")
 
-        #if not pipeline["cpu_offloading"]:
-        #    pipe.transformer.to(device)
+        if not pipeline["cpu_offloading"] and pipeline["manual_offloading"]:
+            pipe.transformer.to(device)
 
         if context_options is not None:
            context_frames = context_options["context_frames"] // 4
@@ -1184,8 +1180,8 @@ class CogVideoXFunSampler:
                 noise_aug_strength = noise_aug_strength,
                 strength = vid2vid_denoise,
             )
-        #if not pipeline["cpu_offloading"]:
-        #    pipe.transformer.to(offload_device)
+        if not pipeline["cpu_offloading"] and pipeline["manual_offloading"]:
+            pipe.transformer.to(offload_device)
         #clear FasterCache
         if fastercache is not None:
             for block in pipe.transformer.transformer_blocks:
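
For reference, a minimal self-contained sketch of the torchao quantization dispatch this patch adds to model_loading.py. Assumptions: torchao >= 0.5 is installed; pick_quant_func and ToyBlock are illustrative stand-ins, not names from the repo; the torchao imports are the same ones the patch uses. Note the sketch tests the longer "fp8dqrow" token before "fp8dq": with plain substring checks in the patch's elif order, "torchao_fp8dqrow" matches the per-tensor "fp8dq" branch first and the per-row branch is never reached.

import torch
import torch.nn as nn
from torchao.quantization import (
    quantize_,
    float8_dynamic_activation_float8_weight,
    int8_dynamic_activation_int8_weight,
)
from torchao.quantization.quant_api import PerRow

def pick_quant_func(fp8_transformer: str):
    # Check the longer token first so "fp8dq" does not shadow "fp8dqrow".
    if "fp8dqrow" in fp8_transformer:
        # Per-row granularity: one scale per weight row instead of per tensor.
        return float8_dynamic_activation_float8_weight(granularity=PerRow())
    if "fp8dq" in fp8_transformer:
        return float8_dynamic_activation_float8_weight()
    if "int8dq" in fp8_transformer:
        return int8_dynamic_activation_int8_weight()
    raise ValueError(f"unknown quantization mode: {fp8_transformer}")

class ToyBlock(nn.Module):
    # Stand-in for CogVideoXBlock: any module whose nn.Linear layers
    # torchao can swap for quantized equivalents.
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(64, 64)

    def forward(self, x):
        return self.proj(x)

if __name__ == "__main__":
    # fp8 dynamic quantization needs a recent GPU (e.g. 4090 / H100);
    # the int8dq path also runs on older cards.
    block = ToyBlock().to(device="cuda", dtype=torch.bfloat16)
    quantize_(block, pick_quant_func("torchao_fp8dqrow"))
    out = block(torch.randn(2, 64, device="cuda", dtype=torch.bfloat16))
    print(out.shape)

Per-row scales are finer-grained than a single per-tensor scale and usually preserve output quality better; when the blocks are compiled, throughput is typically similar, which is presumably why the patch exposes fp8dqrow as a separate option.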