update examples, expose scheduler, force T5 offload

2026-06-30 16:47:05 +08:00 · 2024-08-07 01:10:19 +03:00 · 2024-08-07 01:10:19 +03:00 · 97e89d596e
commit 97e89d596e
parent 8a0af3b663
4 changed files with 436 additions and 410 deletions
--- a/examples/cogvideo_vid2vid_test_example_01.json
+++ b/examples/cogvideo_vid2vid_test_example_01.json
@ -1,46 +1,7 @@
 {
-  "last_node_id": 59,
+  "last_node_id": 64,
-  "last_link_id": 137,
+  "last_link_id": 167,
  "nodes": [
    {
      "id": 31,
      "type": "CogVideoTextEncode",
      "pos": [
        503,
        521
      ],
      "size": {
        "0": 463.01251220703125,
        "1": 98.10446166992188
      },
      "flags": {},
      "order": 4,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 56
        }
      ],
      "outputs": [
        {
          "name": "conditioning",
          "type": "CONDITIONING",
          "links": [
            80
          ],
          "shape": 3,
          "slot_index": 0
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoTextEncode"
      },
      "widgets_values": [
        ""
      ]
    },
    {
      "id": 1,
      "type": "DownloadAndLoadCogVideoModel",
@ -60,8 +21,8 @@
          "name": "cogvideo_pipe",
          "type": "COGVIDEOPIPE",
          "links": [
-            78,
+            83,
-            83
+            159
          ],
          "shape": 3,
          "slot_index": 0
@ -108,47 +69,6 @@
        "sd3"
      ]
    },
    {
      "id": 11,
      "type": "CogVideoDecode",
      "pos": [
        1199,
        661
      ],
      "size": {
        "0": 210,
        "1": 46
      },
      "flags": {},
      "order": 9,
      "mode": 0,
      "inputs": [
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
          "link": 81
        },
        {
          "name": "samples",
          "type": "LATENT",
          "link": 82
        }
      ],
      "outputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "links": [
            118
          ],
          "shape": 3,
          "slot_index": 0
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoDecode"
      }
    },
    {
      "id": 56,
      "type": "SimpleMath+",
@ -233,7 +153,7 @@
          "name": "samples",
          "type": "LATENT",
          "links": [
-            122
+            162
          ],
          "shape": 3,
          "slot_index": 0
@ -301,84 +221,6 @@
        "Node name for S&R": "GetImageSizeAndCount"
      }
    },
    {
      "id": 41,
      "type": "ImageResizeKJ",
      "pos": [
        315,
        -19
      ],
      "size": {
        "0": 315,
        "1": 242
      },
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [
        {
          "name": "image",
          "type": "IMAGE",
          "link": 128
        },
        {
          "name": "get_image_size",
          "type": "IMAGE",
          "link": null
        },
        {
          "name": "width_input",
          "type": "INT",
          "link": null,
          "widget": {
            "name": "width_input"
          }
        },
        {
          "name": "height_input",
          "type": "INT",
          "link": null,
          "widget": {
            "name": "height_input"
          }
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            126
          ],
          "shape": 3,
          "slot_index": 0
        },
        {
          "name": "width",
          "type": "INT",
          "links": null,
          "shape": 3
        },
        {
          "name": "height",
          "type": "INT",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "ImageResizeKJ"
      },
      "widgets_values": [
        720,
        480,
        "lanczos",
        false,
        2,
        0,
        0
      ]
    },
    {
      "id": 59,
      "type": "GetImageRangeFromBatch",
@ -448,10 +290,10 @@
        1451,
        368
      ],
-      "size": [
+      "size": {
-        315,
+        "0": 315,
-        102
+        "1": 102
-      ],
+      },
      "flags": {
        "collapsed": true
      },
@ -552,12 +394,12 @@
      "id": 47,
      "type": "VHS_VideoCombine",
      "pos": [
-        1789,
+        1790,
        -104
      ],
      "size": [
-        1113.3311767578125,
+        1110,
-        712.4437255859375
+        711.3333333333333
      ],
      "flags": {},
      "order": 15,
@ -610,7 +452,7 @@
          "hidden": false,
          "paused": false,
          "params": {
-            "filename": "AnimateDiff_00011.mp4",
+            "filename": "AnimateDiff_00008.mp4",
            "subfolder": "",
            "type": "temp",
            "format": "video/nvenc_h264-mp4",
@ -619,6 +461,190 @@
        }
      }
    },
    {
      "id": 57,
      "type": "GetImageSizeAndCount",
      "pos": [
        674,
        2
      ],
      "size": {
        "0": 210,
        "1": 86
      },
      "flags": {},
      "order": 6,
      "mode": 0,
      "inputs": [
        {
          "name": "image",
          "type": "IMAGE",
          "link": 126,
          "slot_index": 0
        }
      ],
      "outputs": [
        {
          "name": "image",
          "type": "IMAGE",
          "links": [
            129,
            136
          ],
          "shape": 3,
          "slot_index": 0
        },
        {
          "name": "720 width",
          "type": "INT",
          "links": [
            165
          ],
          "shape": 3,
          "slot_index": 1
        },
        {
          "name": "480 height",
          "type": "INT",
          "links": [
            164
          ],
          "shape": 3,
          "slot_index": 2
        },
        {
          "name": "16 count",
          "type": "INT",
          "links": [
            163
          ],
          "shape": 3,
          "slot_index": 3
        }
      ],
      "properties": {
        "Node name for S&R": "GetImageSizeAndCount"
      }
    },
    {
      "id": 41,
      "type": "ImageResizeKJ",
      "pos": [
        315,
        -19
      ],
      "size": {
        "0": 315,
        "1": 242
      },
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [
        {
          "name": "image",
          "type": "IMAGE",
          "link": 128
        },
        {
          "name": "get_image_size",
          "type": "IMAGE",
          "link": null
        },
        {
          "name": "width_input",
          "type": "INT",
          "link": null,
          "widget": {
            "name": "width_input"
          }
        },
        {
          "name": "height_input",
          "type": "INT",
          "link": null,
          "widget": {
            "name": "height_input"
          }
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            126
          ],
          "shape": 3,
          "slot_index": 0
        },
        {
          "name": "width",
          "type": "INT",
          "links": null,
          "shape": 3
        },
        {
          "name": "height",
          "type": "INT",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "ImageResizeKJ"
      },
      "widgets_values": [
        720,
        480,
        "lanczos",
        false,
        2,
        0,
        0
      ]
    },
    {
      "id": 11,
      "type": "CogVideoDecode",
      "pos": [
        1201,
        684
      ],
      "size": {
        "0": 210,
        "1": 46
      },
      "flags": {},
      "order": 9,
      "mode": 0,
      "inputs": [
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
          "link": 166
        },
        {
          "name": "samples",
          "type": "LATENT",
          "link": 167
        }
      ],
      "outputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "links": [
            118
          ],
          "shape": 3,
          "slot_index": 0
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoDecode"
      }
    },
    {
      "id": 30,
      "type": "CogVideoTextEncode",
@ -645,7 +671,7 @@
          "name": "conditioning",
          "type": "CONDITIONING",
          "links": [
-            79
+            160
          ],
          "shape": 3,
          "slot_index": 0
@ -655,84 +681,46 @@
        "Node name for S&R": "CogVideoTextEncode"
      },
      "widgets_values": [
-        "video of dinosaur turning it's head in a cinematic and dramatic scene from a movie"
+        "cinematic video of a red panda turning it's head"
      ]
    },
    {
-      "id": 36,
+      "id": 31,
-      "type": "CogVideoSampler",
+      "type": "CogVideoTextEncode",
      "pos": [
-        1093,
+        503,
-        292
+        521
      ],
      "size": [
        315,
        310
      ],
      "size": {
        "0": 463.01251220703125,
        "1": 98.10446166992188
      },
      "flags": {},
-      "order": 8,
+      "order": 4,
      "mode": 0,
      "inputs": [
        {
-          "name": "pipeline",
+          "name": "clip",
-          "type": "COGVIDEOPIPE",
+          "type": "CLIP",
-          "link": 78
+          "link": 56
        },
        {
          "name": "positive",
          "type": "CONDITIONING",
          "link": 79
        },
        {
          "name": "negative",
          "type": "CONDITIONING",
          "link": 80
        },
        {
          "name": "samples",
          "type": "LATENT",
          "link": 122
        },
        {
          "name": "num_frames",
          "type": "INT",
          "link": 137,
          "widget": {
            "name": "num_frames"
          }
        }
      ],
      "outputs": [
        {
-          "name": "cogvideo_pipe",
+          "name": "conditioning",
-          "type": "COGVIDEOPIPE",
+          "type": "CONDITIONING",
          "links": [
-            81
+            161
          ],
-          "shape": 3
+          "shape": 3,
-        },
+          "slot_index": 0
        {
          "name": "samples",
          "type": "LATENT",
          "links": [
            82
          ],
          "shape": 3
        }
      ],
      "properties": {
-        "Node name for S&R": "CogVideoSampler"
+        "Node name for S&R": "CogVideoTextEncode"
      },
      "widgets_values": [
-        480,
+        "bad quality video, blurry, messy"
        720,
        16,
        8,
        25,
        8,
        1119546789766856,
        "fixed",
        0.8
      ]
    },
    {
@ -819,63 +807,98 @@
      }
    },
    {
-      "id": 57,
+      "id": 64,
-      "type": "GetImageSizeAndCount",
+      "type": "CogVideoSampler",
      "pos": [
-        674,
+        1090,
-        2
+        290
      ],
      "size": {
-        "0": 210,
+        "0": 315,
-        "1": 86
+        "1": 342
      },
      "flags": {},
-      "order": 6,
+      "order": 8,
      "mode": 0,
      "inputs": [
        {
-          "name": "image",
+          "name": "pipeline",
-          "type": "IMAGE",
+          "type": "COGVIDEOPIPE",
-          "link": 126,
+          "link": 159
-          "slot_index": 0
+        },
        {
          "name": "positive",
          "type": "CONDITIONING",
          "link": 160
        },
        {
          "name": "negative",
          "type": "CONDITIONING",
          "link": 161
        },
        {
          "name": "samples",
          "type": "LATENT",
          "link": 162
        },
        {
          "name": "num_frames",
          "type": "INT",
          "link": 163,
          "widget": {
            "name": "num_frames"
          }
        },
        {
          "name": "height",
          "type": "INT",
          "link": 164,
          "widget": {
            "name": "height"
          }
        },
        {
          "name": "width",
          "type": "INT",
          "link": 165,
          "widget": {
            "name": "width"
          }
        }
      ],
      "outputs": [
        {
-          "name": "image",
+          "name": "cogvideo_pipe",
-          "type": "IMAGE",
+          "type": "COGVIDEOPIPE",
          "links": [
-            129,
+            166
            136
          ],
          "shape": 3,
          "slot_index": 0
        },
        {
          "name": "720 width",
          "type": "INT",
          "links": null,
          "shape": 3
        },
        {
-          "name": "480 height",
+          "name": "samples",
-          "type": "INT",
+          "type": "LATENT",
          "links": null,
          "shape": 3
        },
        {
          "name": "16 count",
          "type": "INT",
          "links": [
-            137
+            167
          ],
-          "shape": 3,
+          "shape": 3
          "slot_index": 3
        }
      ],
      "properties": {
-        "Node name for S&R": "GetImageSizeAndCount"
+        "Node name for S&R": "CogVideoSampler"
-      }
+      },
      "widgets_values": [
        480,
        720,
        48,
        8,
        35,
        9,
        6,
        "fixed",
        "DPM",
        0.7000000000000001
      ]
    }
  ],
  "links": [
@ -895,46 +918,6 @@
      0,
      "CLIP"
    ],
    [
      78,
      1,
      0,
      36,
      0,
      "COGVIDEOPIPE"
    ],
    [
      79,
      30,
      0,
      36,
      1,
      "CONDITIONING"
    ],
    [
      80,
      31,
      0,
      36,
      2,
      "CONDITIONING"
    ],
    [
      81,
      36,
      0,
      11,
      0,
      "COGVIDEOPIPE"
    ],
    [
      82,
      36,
      1,
      11,
      1,
      "LATENT"
    ],
    [
      83,
      1,
@ -975,14 +958,6 @@
      0,
      "INT,FLOAT"
    ],
    [
      122,
      37,
      0,
      36,
      3,
      "LATENT"
    ],
    [
      126,
      41,
@ -1048,22 +1023,86 @@
      "IMAGE"
    ],
    [
-      137,
+      159,
      1,
      0,
      64,
      0,
      "COGVIDEOPIPE"
    ],
    [
      160,
      30,
      0,
      64,
      1,
      "CONDITIONING"
    ],
    [
      161,
      31,
      0,
      64,
      2,
      "CONDITIONING"
    ],
    [
      162,
      37,
      0,
      64,
      3,
      "LATENT"
    ],
    [
      163,
      57,
      3,
-      36,
+      64,
      4,
      "INT"
    ],
    [
      164,
      57,
      2,
      64,
      5,
      "INT"
    ],
    [
      165,
      57,
      1,
      64,
      6,
      "INT"
    ],
    [
      166,
      64,
      0,
      11,
      0,
      "COGVIDEOPIPE"
    ],
    [
      167,
      64,
      1,
      11,
      1,
      "LATENT"
    ]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
-      "scale": 0.7513148009015777,
+      "scale": 0.6830134553650705,
      "offset": [
-        45.633655208726886,
+        56.628416841109384,
-        389.8041242612087
+        394.7727729054069
      ]
    }
  },
--- a/examples/example_01.json
+++ b/examples/example_01.json
@ -11,7 +11,7 @@
      ],
      "size": {
        "0": 315,
-        "1": 266
+        "1": 334
      },
      "flags": {},
      "order": 4,
@ -32,6 +32,11 @@
          "name": "negative",
          "type": "CONDITIONING",
          "link": 57
        },
        {
          "name": "samples",
          "type": "LATENT",
          "link": null
        }
      ],
      "outputs": [
@ -63,50 +68,11 @@
        25,
        6,
        806286757407561,
-        "fixed"
+        "fixed",
        "DDIM",
        1
      ]
    },
    {
      "id": 11,
      "type": "CogVideoDecode",
      "pos": [
        1142,
        658
      ],
      "size": {
        "0": 210,
        "1": 46
      },
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
          "link": 37
        },
        {
          "name": "samples",
          "type": "LATENT",
          "link": 38
        }
      ],
      "outputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "links": [
            51
          ],
          "shape": 3,
          "slot_index": 0
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoDecode"
      }
    },
    {
      "id": 28,
      "type": "VHS_VideoCombine",
@ -169,7 +135,7 @@
          "hidden": false,
          "paused": false,
          "params": {
-            "filename": "AnimateDiff_00001.mp4",
+            "filename": "CogVideoX_00001.mp4",
            "subfolder": "",
            "type": "temp",
            "format": "video/h264-mp4",
@ -185,10 +151,10 @@
        500,
        308
      ],
-      "size": [
+      "size": {
-        474.84501511852204,
+        "0": 474.8450012207031,
-        164.74235966960538
+        "1": 164.7423553466797
-      ],
+      },
      "flags": {},
      "order": 2,
      "mode": 0,
@ -258,10 +224,10 @@
        503,
        521
      ],
-      "size": [
+      "size": {
-        463.01251866466464,
+        "0": 463.01251220703125,
-        98.10446321574796
+        "1": 98.10446166992188
-      ],
+      },
      "flags": {},
      "order": 3,
      "mode": 0,
@ -321,6 +287,47 @@
      "widgets_values": [
        "fp16"
      ]
    },
    {
      "id": 11,
      "type": "CogVideoDecode",
      "pos": [
        1138,
        725
      ],
      "size": {
        "0": 210,
        "1": 46
      },
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
          "link": 37
        },
        {
          "name": "samples",
          "type": "LATENT",
          "link": 38
        }
      ],
      "outputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "links": [
            51
          ],
          "shape": 3,
          "slot_index": 0
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoDecode"
      }
    }
  ],
  "links": [
@ -393,10 +400,10 @@
  "config": {},
  "extra": {
    "ds": {
-      "scale": 0.6830134553650706,
+      "scale": 0.9090909090909092,
      "offset": [
-        359.4381777891929,
+        12.99028921497383,
-        334.95283678425216
+        38.21608107136124
      ]
    }
  },
--- a/nodes.py
+++ b/nodes.py
@ -2,7 +2,7 @@ import os
 import torch
 import folder_paths
 import comfy.model_management as mm
-
+from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
 from .pipeline_cogvideox import CogVideoXPipeline
 import logging
@ -54,11 +54,11 @@ class DownloadAndLoadCogVideoModel:
            )
        pipe = CogVideoXPipeline.from_pretrained(base_path, torch_dtype=dtype).to(offload_device)
        pipeline = {
            "pipe": pipe,
-            "dtype": dtype
+            "dtype": dtype,
            "base_path": base_path
        }
        return (pipeline,)
@ -115,11 +115,15 @@ class CogVideoTextEncode:
    CATEGORY = "CogVideoWrapper"
    def process(self, clip, prompt):
        """Encode ``prompt`` with the supplied ``clip`` object.

        Args:
            clip: Object exposing ``tokenizer.t5xxl``, ``cond_stage_model``,
                ``tokenize`` and ``encode_from_tokens``. Its tokenizer
                settings are modified in place by this call.
            prompt: Text handed to ``clip.tokenize``.

        Returns:
            A 1-tuple holding the result of ``clip.encode_from_tokens``.
        """
        # Devices are resolved through the model-management helpers.
        load_device = mm.text_encoder_device()
        offload_device = mm.text_encoder_offload_device()
        # Force padding and a fixed length of 226 on the t5xxl tokenizer.
        # These assignments persist on the shared ``clip`` object after return.
        clip.tokenizer.t5xxl.pad_to_max_length = True
        clip.tokenizer.t5xxl.max_length = 226
        # Move the text encoder to the load device only for the encode step.
        clip.cond_stage_model.to(load_device)
        tokens = clip.tokenize(prompt, return_word_ids=True)
        embeds = clip.encode_from_tokens(tokens, return_pooled=False, return_dict=False)
        # Always hand the encoder back to the offload device once encoding is done.
        # NOTE(review): if tokenize/encode raises, this line is skipped and the
        # encoder stays on load_device - consider try/finally.
        clip.cond_stage_model.to(offload_device)
        return (embeds, )
@ -194,6 +198,7 @@ class CogVideoSampler:
                "steps": ("INT", {"default": 25, "min": 1}),
                "cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
                "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
                "scheduler": (["DDIM", "DPM"],),
            },
            "optional": {
                "samples": ("LATENT", ),
@ -206,16 +211,22 @@ class CogVideoSampler:
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"
-    def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, samples=None, denoise_strength=1.0):
+    def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, samples=None, denoise_strength=1.0):
        mm.soft_empty_cache()
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        pipe = pipeline["pipe"]
        dtype = pipeline["dtype"]
        base_path = pipeline["base_path"]
        pipe.transformer.to(device)
        generator = torch.Generator(device=device).manual_seed(seed)
        if scheduler == "DDIM":
            pipe.scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder="scheduler")
        elif scheduler == "DPM":
            pipe.scheduler = CogVideoXDPMScheduler.from_pretrained(base_path, subfolder="scheduler")
        latents = pipeline["pipe"](
            num_inference_steps=steps,
            height = height,
@ -227,7 +238,6 @@ class CogVideoSampler:
            denoise_strength=denoise_strength,
            prompt_embeds=positive.to(dtype).to(device),
            negative_prompt_embeds=negative.to(dtype).to(device),
            #negative_prompt_embeds=torch.zeros_like(embeds),
            generator=generator,
            output_type="latents",
            device=device
@ -264,11 +274,10 @@ class CogVideoDecode:
        if "num_frames" in pipeline:
            num_frames = pipeline["num_frames"]
            fps = pipeline["fps"]
        else:
            num_frames = latents.shape[2]
            fps = 8
        num_seconds = num_frames // fps
        latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
        latents = 1 / vae.config.scaling_factor * latents
@ -278,17 +287,14 @@ class CogVideoDecode:
            # Whether or not to clear fake context parallel cache
            fake_cp = i + 1 < num_seconds
            start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3)
            current_frames = vae.decode(latents[:, :, start_frame:end_frame], fake_cp=fake_cp).sample
            frames.append(current_frames)
            mm.soft_empty_cache()
        vae.to(offload_device)
        frames = torch.cat(frames, dim=2)
        print(frames.min(), frames.max())
        video = pipeline["pipe"].video_processor.postprocess_video(video=frames, output_type="pt")
        print(video.shape)
        video = video[0].permute(0, 2, 3, 1).cpu().float()
        print(video.min(), video.max())
        return (video,)
--- a/pipeline_cogvideox.py
+++ b/pipeline_cogvideox.py
@ -222,22 +222,6 @@ class CogVideoXPipeline(DiffusionPipeline):
        latents = latents * self.scheduler.init_noise_sigma
        return latents, timesteps
    def decode_latents(self, latents: torch.Tensor, num_seconds: int):
        """Decode ``latents`` with ``self.vae`` in ``num_seconds`` chunks.

        Args:
            latents: 5-D tensor whose dims 1 and 2 are swapped before decoding
                (presumably arrives as [batch, frames, channels, height,
                width] - confirm against the caller).
            num_seconds: Number of decode iterations to run.

        Returns:
            The decoded chunks concatenated along dim 2.
        """
        latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
        # Undo the VAE scaling factor before decoding.
        latents = 1 / self.vae.config.scaling_factor * latents

        frames = []
        for i in range(num_seconds):
            # Whether or not to clear fake context parallel cache
            # (True for every chunk except the last one).
            fake_cp = i + 1 < num_seconds
            # First chunk covers latent frames 0..2; each later chunk covers
            # the next two latent frames (2*i + 1 and 2*i + 2).
            start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3)

            current_frames = self.vae.decode(latents[:, :, start_frame:end_frame], fake_cp=fake_cp).sample
            frames.append(current_frames)
        # Stitch the per-chunk outputs back together along dim 2.
        frames = torch.cat(frames, dim=2)
        return frames
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
@ -534,17 +518,7 @@ class CogVideoXPipeline(DiffusionPipeline):
                    progress_bar.update()
                    comfy_pbar.update(1)
        if not output_type == "latents":
            video = self.decode_latents(latents, num_frames // fps)
            video = self.video_processor.postprocess_video(video=video, output_type=output_type)
        else:
            video = latents
            print(video.shape)
        # Offload all models
        self.maybe_free_model_hooks()
        if not return_dict:
            return (video,)
        return latents
        #return CogVideoXPipelineOutput(frames=video)