cleanup, update example

2025-12-08 20:34:23 +08:00 · 2024-08-27 18:01:36 +03:00 · 2024-08-27 18:01:36 +03:00 · 06b5e021ad
commit 06b5e021ad
parent e98c428e1e
3 changed files with 137 additions and 150 deletions
--- a/examples/cogvideox_5b_example_01.json
+++ b/examples/cogvideox_5b_example_01.json
@ -1,41 +1,7 @@
 {
-  "last_node_id": 33,
-  "last_link_id": 59,
+  "last_node_id": 34,
+  "last_link_id": 64,
  "nodes": [
-    {
-      "id": 20,
-      "type": "CLIPLoader",
-      "pos": [
-        -59,
-        397
-      ],
-      "size": {
-        "0": 451.30548095703125,
-        "1": 82
-      },
-      "flags": {},
-      "order": 0,
-      "mode": 0,
-      "outputs": [
-        {
-          "name": "CLIP",
-          "type": "CLIP",
-          "links": [
-            54,
-            56
-          ],
-          "slot_index": 0,
-          "shape": 3
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "CLIPLoader"
-      },
-      "widgets_values": [
-        "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
-        "sd3"
-      ]
-    },
    {
      "id": 31,
      "type": "CogVideoTextEncode",
@ -62,7 +28,7 @@
          "name": "conditioning",
          "type": "CONDITIONING",
          "links": [
-            57
+            62
          ],
          "slot_index": 0,
          "shape": 3
@ -84,7 +50,7 @@
      ],
      "size": {
        "0": 210,
-        "1": 46
+        "1": 78
      },
      "flags": {},
      "order": 5,
@ -93,12 +59,12 @@
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
-          "link": 37
+          "link": 63
        },
        {
          "name": "samples",
          "type": "LATENT",
-          "link": 38
+          "link": 64
        }
      ],
      "outputs": [
@ -114,7 +80,10 @@
      ],
      "properties": {
        "Node name for S&R": "CogVideoDecode"
-      }
+      },
+      "widgets_values": [
+        false
+      ]
    },
    {
      "id": 1,
@ -128,14 +97,14 @@
        "1": 82
      },
      "flags": {},
-      "order": 1,
+      "order": 0,
      "mode": 0,
      "outputs": [
        {
          "name": "cogvideo_pipe",
          "type": "COGVIDEOPIPE",
          "links": [
-            36
+            60
          ],
          "slot_index": 0,
          "shape": 3
@ -150,16 +119,55 @@
      ]
    },
    {
-      "id": 22,
+      "id": 30,
+      "type": "CogVideoTextEncode",
+      "pos": [
+        500,
+        308
+      ],
+      "size": [
+        471.90143257018326,
+        168.0804709842023
+      ],
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "clip",
+          "type": "CLIP",
+          "link": 54
+        }
+      ],
+      "outputs": [
+        {
+          "name": "conditioning",
+          "type": "CONDITIONING",
+          "links": [
+            61
+          ],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoTextEncode"
+      },
+      "widgets_values": [
+        "A golden retriever, sporting sleek black sunglasses, with its lengthy fur flowing in the breeze, sprints playfully across a rooftop terrace, recently refreshed by a light rain. The scene unfolds from a distance, the dog's energetic bounds growing larger as it approaches the camera, its tail wagging with unrestrained joy, while droplets of water glisten on the concrete behind it. The overcast sky provides a dramatic backdrop, emphasizing the vibrant golden coat of the canine as it dashes towards the viewer.\n\n"
+      ]
+    },
+    {
+      "id": 34,
      "type": "CogVideoSampler",
      "pos": [
        1041,
        342
      ],
-      "size": {
-        "0": 315,
-        "1": 382
-      },
+      "size": [
+        315.84047081854465,
+        358
+      ],
      "flags": {},
      "order": 4,
      "mode": 0,
@ -167,18 +175,17 @@
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
-          "link": 36
+          "link": 60
        },
        {
          "name": "positive",
          "type": "CONDITIONING",
-          "link": 55,
-          "slot_index": 1
+          "link": 61
        },
        {
          "name": "negative",
          "type": "CONDITIONING",
-          "link": 57
+          "link": 62
        },
        {
          "name": "samples",
@ -191,7 +198,7 @@
          "name": "cogvideo_pipe",
          "type": "COGVIDEOPIPE",
          "links": [
-            37
+            63
          ],
          "shape": 3
        },
@ -199,7 +206,7 @@
          "name": "samples",
          "type": "LATENT",
          "links": [
-            38
+            64
          ],
          "shape": 3
        }
@ -211,9 +218,8 @@
        480,
        720,
        49,
-        8,
        50,
-        7,
+        6,
        806286757407563,
        "fixed",
        "DPM",
@ -226,8 +232,8 @@
      "id": 33,
      "type": "VHS_VideoCombine",
      "pos": [
-        1533,
-        136
+        1441,
+        129
      ],
      "size": [
        778.7022705078125,
@ -284,7 +290,7 @@
          "hidden": false,
          "paused": false,
          "params": {
-            "filename": "CogVideoX5B.mp4",
+            "filename": "CogVideoX5B_00009.mp4",
            "subfolder": "",
            "type": "temp",
            "format": "video/nvenc_h264-mp4",
@ -295,70 +301,41 @@
      }
    },
    {
-      "id": 30,
-      "type": "CogVideoTextEncode",
+      "id": 20,
+      "type": "CLIPLoader",
      "pos": [
-        500,
-        308
+        -26,
+        400
      ],
      "size": {
-        "0": 474.8450012207031,
-        "1": 164.7423553466797
+        "0": 451.30548095703125,
+        "1": 82
      },
      "flags": {},
-      "order": 2,
+      "order": 1,
      "mode": 0,
-      "inputs": [
-        {
-          "name": "clip",
-          "type": "CLIP",
-          "link": 54
-        }
-      ],
      "outputs": [
        {
-          "name": "conditioning",
-          "type": "CONDITIONING",
+          "name": "CLIP",
+          "type": "CLIP",
          "links": [
-            55
+            54,
+            56
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
-        "Node name for S&R": "CogVideoTextEncode"
+        "Node name for S&R": "CLIPLoader"
      },
      "widgets_values": [
-        "The camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a steep mountain slope, dust kicks up from its tires, the sunlight shines on the SUV as it speeds along the dirt road, casting a warm glow over the scene. The dirt road curves gently into the distance, with no other cars or vehicles in sight. The trees on either side of the road are redwoods, with patches of greenery scattered throughout. The car is seen from the rear following the curve with ease, making it seem as if it is on a rugged drive through the rugged terrain. The dirt road itself is surrounded by steep hills and mountains, with a clear blue sky above with wispy clouds.\n"
+        "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
+        "sd3"
      ]
    }
  ],
  "links": [
-    [
-      36,
-      1,
-      0,
-      22,
-      0,
-      "COGVIDEOPIPE"
-    ],
-    [
-      37,
-      22,
-      0,
-      11,
-      0,
-      "COGVIDEOPIPE"
-    ],
-    [
-      38,
-      22,
-      1,
-      11,
-      1,
-      "LATENT"
-    ],
    [
      54,
      20,
@ -367,14 +344,6 @@
      0,
      "CLIP"
    ],
-    [
-      55,
-      30,
-      0,
-      22,
-      1,
-      "CONDITIONING"
-    ],
    [
      56,
      20,
@ -383,14 +352,6 @@
      0,
      "CLIP"
    ],
-    [
-      57,
-      31,
-      0,
-      22,
-      2,
-      "CONDITIONING"
-    ],
    [
      59,
      11,
@ -398,16 +359,56 @@
      33,
      0,
      "IMAGE"
+    ],
+    [
+      60,
+      1,
+      0,
+      34,
+      0,
+      "COGVIDEOPIPE"
+    ],
+    [
+      61,
+      30,
+      0,
+      34,
+      1,
+      "CONDITIONING"
+    ],
+    [
+      62,
+      31,
+      0,
+      34,
+      2,
+      "CONDITIONING"
+    ],
+    [
+      63,
+      34,
+      0,
+      11,
+      0,
+      "COGVIDEOPIPE"
+    ],
+    [
+      64,
+      34,
+      1,
+      11,
+      1,
+      "LATENT"
    ]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
-      "scale": 0.7513148009015782,
+      "scale": 0.7513148009015777,
      "offset": [
-        106.37225000664994,
-        78.14886929032406
+        209.1392882550122,
+        105.74671444060245
      ]
    }
  },
--- a/nodes.py
+++ b/nodes.py
@ -31,7 +31,7 @@ class DownloadAndLoadCogVideoModel:
                        "fp32",
                        "bf16",
                    ],
-                    {"default": "bf16"},
+                    {"default": "bf16", "tooltip": "official recommendation is that 2b model should be fp16, 5b model should be bf16"},
                ),
            },
        }
@ -209,13 +209,12 @@ class CogVideoSampler:
                "height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
                "width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
                "num_frames": ("INT", {"default": 48, "min": 8, "max": 1024, "step": 1}),
-                "fps": ("INT", {"default": 8, "min": 1, "max": 100, "step": 1}),
-                "steps": ("INT", {"default": 25, "min": 1}),
+                "steps": ("INT", {"default": 50, "min": 1}),
                "cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
                "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
-                "scheduler": (["DDIM", "DPM"],),
-                "t_tile_length": ("INT", {"default": 16, "min": 2, "max": 128, "step": 1}),
-                "t_tile_overlap": ("INT", {"default": 8, "min": 2, "max": 128, "step": 1}),
+                "scheduler": (["DDIM", "DPM"], {"tooltip": "5B likes DPM, but it doesn't support temporal tiling"}),
+                "t_tile_length": ("INT", {"default": 16, "min": 2, "max": 128, "step": 1, "tooltip": "Length of temporal tiling, use same value as num_frames to disable, disabled automatically for DPM"}),
+                "t_tile_overlap": ("INT", {"default": 8, "min": 2, "max": 128, "step": 1, "tooltip": "Overlap of temporal tiling"}),
            },
            "optional": {
                "samples": ("LATENT", ),
@ -228,7 +227,7 @@ class CogVideoSampler:
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"

-    def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, t_tile_length, t_tile_overlap, samples=None, denoise_strength=1.0):
+    def process(self, pipeline, positive, negative, steps, cfg, seed, height, width, num_frames, scheduler, t_tile_length, t_tile_overlap, samples=None, denoise_strength=1.0):
        mm.soft_empty_cache()

        assert t_tile_length > t_tile_overlap, "t_tile_length must be greater than t_tile_overlap"
@ -257,7 +256,6 @@ class CogVideoSampler:
            num_frames = num_frames,
            t_tile_length = t_tile_length,
            t_tile_overlap = t_tile_overlap,
-            fps = fps,
            guidance_scale=cfg,
            latents=samples["samples"] if samples is not None else None,
            denoise_strength=denoise_strength,
@ -269,8 +267,6 @@ class CogVideoSampler:
        pipe.transformer.to(offload_device)
        mm.soft_empty_cache()
        print(latents.shape)
-        pipeline["fps"] = fps
-        pipeline["num_frames"] = num_frames

        return (pipeline, {"samples": latents})
    
@ -280,6 +276,7 @@ class CogVideoDecode:
        return {"required": {
            "pipeline": ("COGVIDEOPIPE",),
            "samples": ("LATENT", ),
+            "enable_vae_tiling": ("BOOLEAN", {"default": False}),
            }
        }

@ -288,37 +285,27 @@ class CogVideoDecode:
    FUNCTION = "decode"
    CATEGORY = "CogVideoWrapper"

-    def decode(self, pipeline, samples):
+    def decode(self, pipeline, samples, enable_vae_tiling):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        latents = samples["samples"]
        vae = pipeline["pipe"].vae
        vae.to(device)
+        if enable_vae_tiling:
+            vae.enable_tiling(
+                tile_sample_min_height=96,
+                tile_sample_min_width=96,
+                tile_overlap_factor_height=1 / 12,
+                tile_overlap_factor_width=1 / 12,
+            )

-        if "num_frames" in pipeline:
-            num_frames = pipeline["num_frames"]
-            fps = pipeline["fps"]
-        else:
-            num_frames = latents.shape[2]
-            fps = 8
-
-        num_seconds = num_frames // fps
        latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
        latents = 1 / vae.config.scaling_factor * latents

-        frames = []
-        pbar = ProgressBar(num_seconds)
-        # for i in range(num_seconds):
-        #     start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3)
-        #     current_frames = vae.decode(latents[:, :, start_frame:end_frame]).sample
-        #     frames.append(current_frames)
-            
-        #     pbar.update(1)
        frames = vae.decode(latents).sample
        vae.to(offload_device)
        mm.soft_empty_cache()

-        #frames = torch.cat(frames, dim=2)
        video = pipeline["pipe"].video_processor.postprocess_video(video=frames, output_type="pt")
        video = video[0].permute(0, 2, 3, 1).cpu().float()
        print(video.min(), video.max())
--- a/pipeline_cogvideox.py
+++ b/pipeline_cogvideox.py
@ -315,7 +315,6 @@ class CogVideoXPipeline(DiffusionPipeline):
        num_frames: int = 48,
        t_tile_length: int = 12,
        t_tile_overlap: int = 4,
-        fps: int = 8,
        num_inference_steps: int = 50,
        timesteps: Optional[List[int]] = None,
        guidance_scale: float = 6,