From 06b5e021adb04015864464a22b343d1de285f821 Mon Sep 17 00:00:00 2001 From: kijai <40791699+kijai@users.noreply.github.com> Date: Tue, 27 Aug 2024 18:01:36 +0300 Subject: [PATCH] cleanup, update example --- examples/cogvideox_5b_example_01.json | 243 +++++++++++++------------- nodes.py | 43 ++--- pipeline_cogvideox.py | 1 - 3 files changed, 137 insertions(+), 150 deletions(-) diff --git a/examples/cogvideox_5b_example_01.json b/examples/cogvideox_5b_example_01.json index f3b2e53..50d0668 100644 --- a/examples/cogvideox_5b_example_01.json +++ b/examples/cogvideox_5b_example_01.json @@ -1,41 +1,7 @@ { - "last_node_id": 33, - "last_link_id": 59, + "last_node_id": 34, + "last_link_id": 64, "nodes": [ - { - "id": 20, - "type": "CLIPLoader", - "pos": [ - -59, - 397 - ], - "size": { - "0": 451.30548095703125, - "1": 82 - }, - "flags": {}, - "order": 0, - "mode": 0, - "outputs": [ - { - "name": "CLIP", - "type": "CLIP", - "links": [ - 54, - 56 - ], - "slot_index": 0, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "CLIPLoader" - }, - "widgets_values": [ - "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", - "sd3" - ] - }, { "id": 31, "type": "CogVideoTextEncode", @@ -62,7 +28,7 @@ "name": "conditioning", "type": "CONDITIONING", "links": [ - 57 + 62 ], "slot_index": 0, "shape": 3 @@ -84,7 +50,7 @@ ], "size": { "0": 210, - "1": 46 + "1": 78 }, "flags": {}, "order": 5, @@ -93,12 +59,12 @@ { "name": "pipeline", "type": "COGVIDEOPIPE", - "link": 37 + "link": 63 }, { "name": "samples", "type": "LATENT", - "link": 38 + "link": 64 } ], "outputs": [ @@ -114,7 +80,10 @@ ], "properties": { "Node name for S&R": "CogVideoDecode" - } + }, + "widgets_values": [ + false + ] }, { "id": 1, @@ -128,14 +97,14 @@ "1": 82 }, "flags": {}, - "order": 1, + "order": 0, "mode": 0, "outputs": [ { "name": "cogvideo_pipe", "type": "COGVIDEOPIPE", "links": [ - 36 + 60 ], "slot_index": 0, "shape": 3 @@ -150,16 +119,55 @@ ] }, { - "id": 22, + "id": 30, + "type": "CogVideoTextEncode", + "pos": [ + 500, + 308 + ], + "size": [ + 471.90143257018326, + 168.0804709842023 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 54 + } + ], + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [ + 61 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoTextEncode" + }, + "widgets_values": [ + "A golden retriever, sporting sleek black sunglasses, with its lengthy fur flowing in the breeze, sprints playfully across a rooftop terrace, recently refreshed by a light rain. The scene unfolds from a distance, the dog's energetic bounds growing larger as it approaches the camera, its tail wagging with unrestrained joy, while droplets of water glisten on the concrete behind it. 
The overcast sky provides a dramatic backdrop, emphasizing the vibrant golden coat of the canine as it dashes towards the viewer.\n\n" + ] + }, + { + "id": 34, "type": "CogVideoSampler", "pos": [ 1041, 342 ], - "size": { - "0": 315, - "1": 382 - }, + "size": [ + 315.84047081854465, + 358 + ], "flags": {}, "order": 4, "mode": 0, @@ -167,18 +175,17 @@ { "name": "pipeline", "type": "COGVIDEOPIPE", - "link": 36 + "link": 60 }, { "name": "positive", "type": "CONDITIONING", - "link": 55, - "slot_index": 1 + "link": 61 }, { "name": "negative", "type": "CONDITIONING", - "link": 57 + "link": 62 }, { "name": "samples", @@ -191,7 +198,7 @@ "name": "cogvideo_pipe", "type": "COGVIDEOPIPE", "links": [ - 37 + 63 ], "shape": 3 }, @@ -199,7 +206,7 @@ "name": "samples", "type": "LATENT", "links": [ - 38 + 64 ], "shape": 3 } @@ -211,9 +218,8 @@ 480, 720, 49, - 8, 50, - 7, + 6, 806286757407563, "fixed", "DPM", @@ -226,8 +232,8 @@ "id": 33, "type": "VHS_VideoCombine", "pos": [ - 1533, - 136 + 1441, + 129 ], "size": [ 778.7022705078125, @@ -284,7 +290,7 @@ "hidden": false, "paused": false, "params": { - "filename": "CogVideoX5B.mp4", + "filename": "CogVideoX5B_00009.mp4", "subfolder": "", "type": "temp", "format": "video/nvenc_h264-mp4", @@ -295,70 +301,41 @@ } }, { - "id": 30, - "type": "CogVideoTextEncode", + "id": 20, + "type": "CLIPLoader", "pos": [ - 500, - 308 + -26, + 400 ], "size": { - "0": 474.8450012207031, - "1": 164.7423553466797 + "0": 451.30548095703125, + "1": 82 }, "flags": {}, - "order": 2, + "order": 1, "mode": 0, - "inputs": [ - { - "name": "clip", - "type": "CLIP", - "link": 54 - } - ], "outputs": [ { - "name": "conditioning", - "type": "CONDITIONING", + "name": "CLIP", + "type": "CLIP", "links": [ - 55 + 54, + 56 ], "slot_index": 0, "shape": 3 } ], "properties": { - "Node name for S&R": "CogVideoTextEncode" + "Node name for S&R": "CLIPLoader" }, "widgets_values": [ - "The camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a steep mountain slope, dust kicks up from its tires, the sunlight shines on the SUV as it speeds along the dirt road, casting a warm glow over the scene. The dirt road curves gently into the distance, with no other cars or vehicles in sight. The trees on either side of the road are redwoods, with patches of greenery scattered throughout. The car is seen from the rear following the curve with ease, making it seem as if it is on a rugged drive through the rugged terrain. 
The dirt road itself is surrounded by steep hills and mountains, with a clear blue sky above with wispy clouds.\n" + "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", + "sd3" ] } ], "links": [ - [ - 36, - 1, - 0, - 22, - 0, - "COGVIDEOPIPE" - ], - [ - 37, - 22, - 0, - 11, - 0, - "COGVIDEOPIPE" - ], - [ - 38, - 22, - 1, - 11, - 1, - "LATENT" - ], [ 54, 20, @@ -367,14 +344,6 @@ 0, "CLIP" ], - [ - 55, - 30, - 0, - 22, - 1, - "CONDITIONING" - ], [ 56, 20, @@ -383,14 +352,6 @@ 0, "CLIP" ], - [ - 57, - 31, - 0, - 22, - 2, - "CONDITIONING" - ], [ 59, 11, @@ -398,16 +359,56 @@ 33, 0, "IMAGE" + ], + [ + 60, + 1, + 0, + 34, + 0, + "COGVIDEOPIPE" + ], + [ + 61, + 30, + 0, + 34, + 1, + "CONDITIONING" + ], + [ + 62, + 31, + 0, + 34, + 2, + "CONDITIONING" + ], + [ + 63, + 34, + 0, + 11, + 0, + "COGVIDEOPIPE" + ], + [ + 64, + 34, + 1, + 11, + 1, + "LATENT" ] ], "groups": [], "config": {}, "extra": { "ds": { - "scale": 0.7513148009015782, + "scale": 0.7513148009015777, "offset": [ - 106.37225000664994, - 78.14886929032406 + 209.1392882550122, + 105.74671444060245 ] } }, diff --git a/nodes.py b/nodes.py index 4ca561a..6ab552b 100644 --- a/nodes.py +++ b/nodes.py @@ -31,7 +31,7 @@ class DownloadAndLoadCogVideoModel: "fp32", "bf16", ], - {"default": "bf16"}, + {"default": "bf16", "tooltip": "official recommendation is that 2b model should be fp16, 5b model should be bf16"}, ), }, } @@ -209,13 +209,12 @@ class CogVideoSampler: "height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}), "width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}), "num_frames": ("INT", {"default": 48, "min": 8, "max": 1024, "step": 1}), - "fps": ("INT", {"default": 8, "min": 1, "max": 100, "step": 1}), - "steps": ("INT", {"default": 25, "min": 1}), + "steps": ("INT", {"default": 50, "min": 1}), "cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}), "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}), - "scheduler": (["DDIM", "DPM"],), - "t_tile_length": ("INT", {"default": 16, "min": 2, "max": 128, "step": 1}), - "t_tile_overlap": ("INT", {"default": 8, "min": 2, "max": 128, "step": 1}), + "scheduler": (["DDIM", "DPM"], {"tooltip": "5B likes DPM, but it doesn't support temporal tiling"}), + "t_tile_length": ("INT", {"default": 16, "min": 2, "max": 128, "step": 1, "tooltip": "Length of temporal tiling, use same alue as num_frames to disable, disabled automatically for DPM"}), + "t_tile_overlap": ("INT", {"default": 8, "min": 2, "max": 128, "step": 1, "tooltip": "Overlap of temporal tiling"}), }, "optional": { "samples": ("LATENT", ), @@ -228,7 +227,7 @@ class CogVideoSampler: FUNCTION = "process" CATEGORY = "CogVideoWrapper" - def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, t_tile_length, t_tile_overlap, samples=None, denoise_strength=1.0): + def process(self, pipeline, positive, negative, steps, cfg, seed, height, width, num_frames, scheduler, t_tile_length, t_tile_overlap, samples=None, denoise_strength=1.0): mm.soft_empty_cache() assert t_tile_length > t_tile_overlap, "t_tile_length must be greater than t_tile_overlap" @@ -257,7 +256,6 @@ class CogVideoSampler: num_frames = num_frames, t_tile_length = t_tile_length, t_tile_overlap = t_tile_overlap, - fps = fps, guidance_scale=cfg, latents=samples["samples"] if samples is not None else None, denoise_strength=denoise_strength, @@ -269,8 +267,6 @@ class CogVideoSampler: pipe.transformer.to(offload_device) mm.soft_empty_cache() print(latents.shape) 
- pipeline["fps"] = fps - pipeline["num_frames"] = num_frames return (pipeline, {"samples": latents}) @@ -280,6 +276,7 @@ class CogVideoDecode: return {"required": { "pipeline": ("COGVIDEOPIPE",), "samples": ("LATENT", ), + "enable_vae_tiling": ("BOOLEAN", {"default": False}), } } @@ -288,37 +285,27 @@ class CogVideoDecode: FUNCTION = "decode" CATEGORY = "CogVideoWrapper" - def decode(self, pipeline, samples): + def decode(self, pipeline, samples, enable_vae_tiling): device = mm.get_torch_device() offload_device = mm.unet_offload_device() latents = samples["samples"] vae = pipeline["pipe"].vae vae.to(device) + if enable_vae_tiling: + vae.enable_tiling( + tile_sample_min_height=96, + tile_sample_min_width=96, + tile_overlap_factor_height=1 / 12, + tile_overlap_factor_width=1 / 12, + ) - if "num_frames" in pipeline: - num_frames = pipeline["num_frames"] - fps = pipeline["fps"] - else: - num_frames = latents.shape[2] - fps = 8 - - num_seconds = num_frames // fps latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width] latents = 1 / vae.config.scaling_factor * latents - frames = [] - pbar = ProgressBar(num_seconds) - # for i in range(num_seconds): - # start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3) - # current_frames = vae.decode(latents[:, :, start_frame:end_frame]).sample - # frames.append(current_frames) - - # pbar.update(1) frames = vae.decode(latents).sample vae.to(offload_device) mm.soft_empty_cache() - #frames = torch.cat(frames, dim=2) video = pipeline["pipe"].video_processor.postprocess_video(video=frames, output_type="pt") video = video[0].permute(0, 2, 3, 1).cpu().float() print(video.min(), video.max()) diff --git a/pipeline_cogvideox.py b/pipeline_cogvideox.py index edceb4c..496a3c3 100644 --- a/pipeline_cogvideox.py +++ b/pipeline_cogvideox.py @@ -315,7 +315,6 @@ class CogVideoXPipeline(DiffusionPipeline): num_frames: int = 48, t_tile_length: int = 12, t_tile_overlap: int = 4, - fps: int = 8, num_inference_steps: int = 50, timesteps: Optional[List[int]] = None, guidance_scale: float = 6,