update

2025-12-09 04:44:22 +08:00 · 2024-11-17 00:48:01 +02:00 · 2024-11-17 00:48:01 +02:00 · 15aa68c95d
commit 15aa68c95d
parent 4374273138
3 changed files with 18 additions and 11 deletions
--- a/cogvideox_fun/pipeline_cogvideox_inpaint.py
+++ b/cogvideox_fun/pipeline_cogvideox_inpaint.py
@@ -739,6 +739,8 @@ class CogVideoX_Fun_Pipeline_Inpaint(DiffusionPipeline):
        num_channels_transformer = self.transformer.config.in_channels
        return_image_latents = num_channels_transformer == num_channels_latents

+        self.vae.to(device)
+
        latents_outputs = self.prepare_latents(
            batch_size * num_videos_per_prompt,
            num_channels_latents,
@@ -840,6 +842,9 @@ class CogVideoX_Fun_Pipeline_Inpaint(DiffusionPipeline):
                mask = rearrange(mask, "b c f h w -> b f c h w")

                inpaint_latents = None
+
+        self.vae.to(torch.device("cpu"))
+
        if comfyui_progressbar:
            pbar.update(1)

--- a/model_loading.py
+++ b/model_loading.py
@@ -121,7 +121,7 @@ class DownloadAndLoadCogVideoModel:
                "precision": (["fp16", "fp32", "bf16"],
                    {"default": "bf16", "tooltip": "official recommendation is that 2b model should be fp16, 5b model should be bf16"}
                ),
-                "fp8_transformer": (['disabled', 'enabled', 'fastmode', 'torchao_fp8dq', "torchao_fp6"], {"default": 'disabled', "tooltip": "enabled casts the transformer to torch.float8_e4m3fn, fastmode is only for latest nvidia GPUs and requires torch 2.4.0 and cu124 minimum"}),
+                "fp8_transformer": (['disabled', 'enabled', 'fastmode', 'torchao_fp8dq', "torchao_fp8dqrow", "torchao_int8dq", "torchao_fp6"], {"default": 'disabled', "tooltip": "enabled casts the transformer to torch.float8_e4m3fn, fastmode is only for latest nvidia GPUs and requires torch 2.4.0 and cu124 minimum"}),
                "compile": (["disabled","onediff","torch"], {"tooltip": "compile the model for faster inference, these are advanced options only available on Linux, see readme for more info"}),
                "enable_sequential_cpu_offload": ("BOOLEAN", {"default": False, "tooltip": "significantly reducing memory usage and slows down the inference"}),
                "block_edit": ("TRANSFORMERBLOCKS", {"default": None}),
@@ -301,7 +301,8 @@ class DownloadAndLoadCogVideoModel:
                from torchao.quantization import (
                quantize_,
                fpx_weight_only,
-                float8_dynamic_activation_float8_weight
+                float8_dynamic_activation_float8_weight,
+                int8_dynamic_activation_int8_weight
            )
            except:
                raise ImportError("torchao is not installed, please install torchao to use fp8dq")
@@ -316,6 +317,11 @@ class DownloadAndLoadCogVideoModel:
                quant_func = fpx_weight_only(3, 2)
            elif "fp8dq" in fp8_transformer: #very fast on 4090 when compiled
                quant_func = float8_dynamic_activation_float8_weight()
+            elif 'fp8dqrow' in fp8_transformer:
+                from torchao.quantization.quant_api import PerRow
+                quant_func = float8_dynamic_activation_float8_weight(granularity=PerRow())
+            elif 'int8dq' in fp8_transformer:
+                quant_func = int8_dynamic_activation_int8_weight()
        
            for i, block in enumerate(pipe.transformer.transformer_blocks):
                if "CogVideoXBlock" in str(block):
--- a/nodes.py
+++ b/nodes.py
@@ -1101,10 +1101,6 @@ class CogVideoXFunSampler:
        assert "fun" in base_path.lower(), "'Unfun' models not supported in 'CogVideoXFunSampler', use the 'CogVideoSampler'"
        assert "pose" not in base_path.lower(), "'Pose' models not supported in 'CogVideoXFunSampler', use the 'CogVideoXFunControlSampler'"

-
-        if not pipeline["cpu_offloading"]:
-            pipe.enable_model_cpu_offload(device=device)
-
        mm.soft_empty_cache()

        #vid2vid
@@ -1123,8 +1119,8 @@ class CogVideoXFunSampler:
        else:
            raise ValueError(f"Unknown scheduler: {scheduler}")

-        #if not pipeline["cpu_offloading"]:
-        #    pipe.transformer.to(device)
+        if not pipeline["cpu_offloading"] and pipeline["manual_offloading"]:
+            pipe.transformer.to(device)

        if context_options is not None:
            context_frames = context_options["context_frames"] // 4
@@ -1184,8 +1180,8 @@ class CogVideoXFunSampler:
                noise_aug_strength = noise_aug_strength,
                strength = vid2vid_denoise,
            )
-        #if not pipeline["cpu_offloading"]:
-        #     pipe.transformer.to(offload_device)
+        if not pipeline["cpu_offloading"] and pipeline["manual_offloading"]:
+            pipe.transformer.to(offload_device)
        #clear FasterCache
        if fastercache is not None:
            for block in pipe.transformer.transformer_blocks: