update examples, expose scheduler, force T5 offload

2026-03-16 15:17:16 +08:00 · 2024-08-07 01:10:19 +03:00 · 2024-08-07 01:10:19 +03:00 · 97e89d596e
commit 97e89d596e
parent 8a0af3b663
4 changed files with 436 additions and 410 deletions
--- a/examples/cogvideo_vid2vid_test_example_01.json
+++ b/examples/cogvideo_vid2vid_test_example_01.json
@ -1,46 +1,7 @@
 {
-  "last_node_id": 59,
-  "last_link_id": 137,
+  "last_node_id": 64,
+  "last_link_id": 167,
  "nodes": [
-    {
-      "id": 31,
-      "type": "CogVideoTextEncode",
-      "pos": [
-        503,
-        521
-      ],
-      "size": {
-        "0": 463.01251220703125,
-        "1": 98.10446166992188
-      },
-      "flags": {},
-      "order": 4,
-      "mode": 0,
-      "inputs": [
-        {
-          "name": "clip",
-          "type": "CLIP",
-          "link": 56
-        }
-      ],
-      "outputs": [
-        {
-          "name": "conditioning",
-          "type": "CONDITIONING",
-          "links": [
-            80
-          ],
-          "shape": 3,
-          "slot_index": 0
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "CogVideoTextEncode"
-      },
-      "widgets_values": [
-        ""
-      ]
-    },
    {
      "id": 1,
      "type": "DownloadAndLoadCogVideoModel",
@ -60,8 +21,8 @@
          "name": "cogvideo_pipe",
          "type": "COGVIDEOPIPE",
          "links": [
-            78,
-            83
+            83,
+            159
          ],
          "shape": 3,
          "slot_index": 0
@ -108,47 +69,6 @@
        "sd3"
      ]
    },
-    {
-      "id": 11,
-      "type": "CogVideoDecode",
-      "pos": [
-        1199,
-        661
-      ],
-      "size": {
-        "0": 210,
-        "1": 46
-      },
-      "flags": {},
-      "order": 9,
-      "mode": 0,
-      "inputs": [
-        {
-          "name": "pipeline",
-          "type": "COGVIDEOPIPE",
-          "link": 81
-        },
-        {
-          "name": "samples",
-          "type": "LATENT",
-          "link": 82
-        }
-      ],
-      "outputs": [
-        {
-          "name": "images",
-          "type": "IMAGE",
-          "links": [
-            118
-          ],
-          "shape": 3,
-          "slot_index": 0
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "CogVideoDecode"
-      }
-    },
    {
      "id": 56,
      "type": "SimpleMath+",
@ -233,7 +153,7 @@
          "name": "samples",
          "type": "LATENT",
          "links": [
-            122
+            162
          ],
          "shape": 3,
          "slot_index": 0
@ -301,84 +221,6 @@
        "Node name for S&R": "GetImageSizeAndCount"
      }
    },
-    {
-      "id": 41,
-      "type": "ImageResizeKJ",
-      "pos": [
-        315,
-        -19
-      ],
-      "size": {
-        "0": 315,
-        "1": 242
-      },
-      "flags": {},
-      "order": 5,
-      "mode": 0,
-      "inputs": [
-        {
-          "name": "image",
-          "type": "IMAGE",
-          "link": 128
-        },
-        {
-          "name": "get_image_size",
-          "type": "IMAGE",
-          "link": null
-        },
-        {
-          "name": "width_input",
-          "type": "INT",
-          "link": null,
-          "widget": {
-            "name": "width_input"
-          }
-        },
-        {
-          "name": "height_input",
-          "type": "INT",
-          "link": null,
-          "widget": {
-            "name": "height_input"
-          }
-        }
-      ],
-      "outputs": [
-        {
-          "name": "IMAGE",
-          "type": "IMAGE",
-          "links": [
-            126
-          ],
-          "shape": 3,
-          "slot_index": 0
-        },
-        {
-          "name": "width",
-          "type": "INT",
-          "links": null,
-          "shape": 3
-        },
-        {
-          "name": "height",
-          "type": "INT",
-          "links": null,
-          "shape": 3
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "ImageResizeKJ"
-      },
-      "widgets_values": [
-        720,
-        480,
-        "lanczos",
-        false,
-        2,
-        0,
-        0
-      ]
-    },
    {
      "id": 59,
      "type": "GetImageRangeFromBatch",
@ -448,10 +290,10 @@
        1451,
        368
      ],
-      "size": [
-        315,
-        102
-      ],
+      "size": {
+        "0": 315,
+        "1": 102
+      },
      "flags": {
        "collapsed": true
      },
@ -552,12 +394,12 @@
      "id": 47,
      "type": "VHS_VideoCombine",
      "pos": [
-        1789,
+        1790,
        -104
      ],
      "size": [
-        1113.3311767578125,
-        712.4437255859375
+        1110,
+        711.3333333333333
      ],
      "flags": {},
      "order": 15,
@ -610,7 +452,7 @@
          "hidden": false,
          "paused": false,
          "params": {
-            "filename": "AnimateDiff_00011.mp4",
+            "filename": "AnimateDiff_00008.mp4",
            "subfolder": "",
            "type": "temp",
            "format": "video/nvenc_h264-mp4",
@ -619,6 +461,190 @@
        }
      }
    },
+    {
+      "id": 57,
+      "type": "GetImageSizeAndCount",
+      "pos": [
+        674,
+        2
+      ],
+      "size": {
+        "0": 210,
+        "1": 86
+      },
+      "flags": {},
+      "order": 6,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "image",
+          "type": "IMAGE",
+          "link": 126,
+          "slot_index": 0
+        }
+      ],
+      "outputs": [
+        {
+          "name": "image",
+          "type": "IMAGE",
+          "links": [
+            129,
+            136
+          ],
+          "shape": 3,
+          "slot_index": 0
+        },
+        {
+          "name": "720 width",
+          "type": "INT",
+          "links": [
+            165
+          ],
+          "shape": 3,
+          "slot_index": 1
+        },
+        {
+          "name": "480 height",
+          "type": "INT",
+          "links": [
+            164
+          ],
+          "shape": 3,
+          "slot_index": 2
+        },
+        {
+          "name": "16 count",
+          "type": "INT",
+          "links": [
+            163
+          ],
+          "shape": 3,
+          "slot_index": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "GetImageSizeAndCount"
+      }
+    },
+    {
+      "id": 41,
+      "type": "ImageResizeKJ",
+      "pos": [
+        315,
+        -19
+      ],
+      "size": {
+        "0": 315,
+        "1": 242
+      },
+      "flags": {},
+      "order": 5,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "image",
+          "type": "IMAGE",
+          "link": 128
+        },
+        {
+          "name": "get_image_size",
+          "type": "IMAGE",
+          "link": null
+        },
+        {
+          "name": "width_input",
+          "type": "INT",
+          "link": null,
+          "widget": {
+            "name": "width_input"
+          }
+        },
+        {
+          "name": "height_input",
+          "type": "INT",
+          "link": null,
+          "widget": {
+            "name": "height_input"
+          }
+        }
+      ],
+      "outputs": [
+        {
+          "name": "IMAGE",
+          "type": "IMAGE",
+          "links": [
+            126
+          ],
+          "shape": 3,
+          "slot_index": 0
+        },
+        {
+          "name": "width",
+          "type": "INT",
+          "links": null,
+          "shape": 3
+        },
+        {
+          "name": "height",
+          "type": "INT",
+          "links": null,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "ImageResizeKJ"
+      },
+      "widgets_values": [
+        720,
+        480,
+        "lanczos",
+        false,
+        2,
+        0,
+        0
+      ]
+    },
+    {
+      "id": 11,
+      "type": "CogVideoDecode",
+      "pos": [
+        1201,
+        684
+      ],
+      "size": {
+        "0": 210,
+        "1": 46
+      },
+      "flags": {},
+      "order": 9,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "pipeline",
+          "type": "COGVIDEOPIPE",
+          "link": 166
+        },
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "link": 167
+        }
+      ],
+      "outputs": [
+        {
+          "name": "images",
+          "type": "IMAGE",
+          "links": [
+            118
+          ],
+          "shape": 3,
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoDecode"
+      }
+    },
    {
      "id": 30,
      "type": "CogVideoTextEncode",
@ -645,7 +671,7 @@
          "name": "conditioning",
          "type": "CONDITIONING",
          "links": [
-            79
+            160
          ],
          "shape": 3,
          "slot_index": 0
@ -655,84 +681,46 @@
        "Node name for S&R": "CogVideoTextEncode"
      },
      "widgets_values": [
-        "video of dinosaur turning it's head in a cinematic and dramatic scene from a movie"
+        "cinematic video of a red panda turning it's head"
      ]
    },
    {
-      "id": 36,
-      "type": "CogVideoSampler",
+      "id": 31,
+      "type": "CogVideoTextEncode",
      "pos": [
-        1093,
-        292
-      ],
-      "size": [
-        315,
-        310
+        503,
+        521
      ],
+      "size": {
+        "0": 463.01251220703125,
+        "1": 98.10446166992188
+      },
      "flags": {},
-      "order": 8,
+      "order": 4,
      "mode": 0,
      "inputs": [
        {
-          "name": "pipeline",
-          "type": "COGVIDEOPIPE",
-          "link": 78
-        },
-        {
-          "name": "positive",
-          "type": "CONDITIONING",
-          "link": 79
-        },
-        {
-          "name": "negative",
-          "type": "CONDITIONING",
-          "link": 80
-        },
-        {
-          "name": "samples",
-          "type": "LATENT",
-          "link": 122
-        },
-        {
-          "name": "num_frames",
-          "type": "INT",
-          "link": 137,
-          "widget": {
-            "name": "num_frames"
-          }
+          "name": "clip",
+          "type": "CLIP",
+          "link": 56
        }
      ],
      "outputs": [
        {
-          "name": "cogvideo_pipe",
-          "type": "COGVIDEOPIPE",
+          "name": "conditioning",
+          "type": "CONDITIONING",
          "links": [
-            81
+            161
          ],
-          "shape": 3
-        },
-        {
-          "name": "samples",
-          "type": "LATENT",
-          "links": [
-            82
-          ],
-          "shape": 3
+          "shape": 3,
+          "slot_index": 0
        }
      ],
      "properties": {
-        "Node name for S&R": "CogVideoSampler"
+        "Node name for S&R": "CogVideoTextEncode"
      },
      "widgets_values": [
-        480,
-        720,
-        16,
-        8,
-        25,
-        8,
-        1119546789766856,
-        "fixed",
-        0.8
+        "bad quality video, blurry, messy"
      ]
    },
    {
@ -819,63 +807,98 @@
      }
    },
    {
-      "id": 57,
-      "type": "GetImageSizeAndCount",
+      "id": 64,
+      "type": "CogVideoSampler",
      "pos": [
-        674,
-        2
+        1090,
+        290
      ],
      "size": {
-        "0": 210,
-        "1": 86
+        "0": 315,
+        "1": 342
      },
      "flags": {},
-      "order": 6,
+      "order": 8,
      "mode": 0,
      "inputs": [
        {
-          "name": "image",
-          "type": "IMAGE",
-          "link": 126,
-          "slot_index": 0
+          "name": "pipeline",
+          "type": "COGVIDEOPIPE",
+          "link": 159
+        },
+        {
+          "name": "positive",
+          "type": "CONDITIONING",
+          "link": 160
+        },
+        {
+          "name": "negative",
+          "type": "CONDITIONING",
+          "link": 161
+        },
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "link": 162
+        },
+        {
+          "name": "num_frames",
+          "type": "INT",
+          "link": 163,
+          "widget": {
+            "name": "num_frames"
+          }
+        },
+        {
+          "name": "height",
+          "type": "INT",
+          "link": 164,
+          "widget": {
+            "name": "height"
+          }
+        },
+        {
+          "name": "width",
+          "type": "INT",
+          "link": 165,
+          "widget": {
+            "name": "width"
+          }
        }
      ],
      "outputs": [
        {
-          "name": "image",
-          "type": "IMAGE",
+          "name": "cogvideo_pipe",
+          "type": "COGVIDEOPIPE",
          "links": [
-            129,
-            136
+            166
          ],
-          "shape": 3,
-          "slot_index": 0
-        },
-        {
-          "name": "720 width",
-          "type": "INT",
-          "links": null,
          "shape": 3
        },
        {
-          "name": "480 height",
-          "type": "INT",
-          "links": null,
-          "shape": 3
-        },
-        {
-          "name": "16 count",
-          "type": "INT",
+          "name": "samples",
+          "type": "LATENT",
          "links": [
-            137
+            167
          ],
-          "shape": 3,
-          "slot_index": 3
+          "shape": 3
        }
      ],
      "properties": {
-        "Node name for S&R": "GetImageSizeAndCount"
-      }
+        "Node name for S&R": "CogVideoSampler"
+      },
+      "widgets_values": [
+        480,
+        720,
+        48,
+        8,
+        35,
+        9,
+        6,
+        "fixed",
+        "DPM",
+        0.7000000000000001
+      ]
    }
  ],
  "links": [
@ -895,46 +918,6 @@
      0,
      "CLIP"
    ],
-    [
-      78,
-      1,
-      0,
-      36,
-      0,
-      "COGVIDEOPIPE"
-    ],
-    [
-      79,
-      30,
-      0,
-      36,
-      1,
-      "CONDITIONING"
-    ],
-    [
-      80,
-      31,
-      0,
-      36,
-      2,
-      "CONDITIONING"
-    ],
-    [
-      81,
-      36,
-      0,
-      11,
-      0,
-      "COGVIDEOPIPE"
-    ],
-    [
-      82,
-      36,
-      1,
-      11,
-      1,
-      "LATENT"
-    ],
    [
      83,
      1,
@ -975,14 +958,6 @@
      0,
      "INT,FLOAT"
    ],
-    [
-      122,
-      37,
-      0,
-      36,
-      3,
-      "LATENT"
-    ],
    [
      126,
      41,
@ -1048,22 +1023,86 @@
      "IMAGE"
    ],
    [
-      137,
+      159,
+      1,
+      0,
+      64,
+      0,
+      "COGVIDEOPIPE"
+    ],
+    [
+      160,
+      30,
+      0,
+      64,
+      1,
+      "CONDITIONING"
+    ],
+    [
+      161,
+      31,
+      0,
+      64,
+      2,
+      "CONDITIONING"
+    ],
+    [
+      162,
+      37,
+      0,
+      64,
+      3,
+      "LATENT"
+    ],
+    [
+      163,
      57,
      3,
-      36,
+      64,
      4,
      "INT"
+    ],
+    [
+      164,
+      57,
+      2,
+      64,
+      5,
+      "INT"
+    ],
+    [
+      165,
+      57,
+      1,
+      64,
+      6,
+      "INT"
+    ],
+    [
+      166,
+      64,
+      0,
+      11,
+      0,
+      "COGVIDEOPIPE"
+    ],
+    [
+      167,
+      64,
+      1,
+      11,
+      1,
+      "LATENT"
    ]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
-      "scale": 0.7513148009015777,
+      "scale": 0.6830134553650705,
      "offset": [
-        45.633655208726886,
-        389.8041242612087
+        56.628416841109384,
+        394.7727729054069
      ]
    }
  },
--- a/examples/example_01.json
+++ b/examples/example_01.json
@ -11,7 +11,7 @@
      ],
      "size": {
        "0": 315,
-        "1": 266
+        "1": 334
      },
      "flags": {},
      "order": 4,
@ -32,6 +32,11 @@
          "name": "negative",
          "type": "CONDITIONING",
          "link": 57
+        },
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "link": null
        }
      ],
      "outputs": [
@ -63,50 +68,11 @@
        25,
        6,
        806286757407561,
-        "fixed"
+        "fixed",
+        "DDIM",
+        1
      ]
    },
-    {
-      "id": 11,
-      "type": "CogVideoDecode",
-      "pos": [
-        1142,
-        658
-      ],
-      "size": {
-        "0": 210,
-        "1": 46
-      },
-      "flags": {},
-      "order": 5,
-      "mode": 0,
-      "inputs": [
-        {
-          "name": "pipeline",
-          "type": "COGVIDEOPIPE",
-          "link": 37
-        },
-        {
-          "name": "samples",
-          "type": "LATENT",
-          "link": 38
-        }
-      ],
-      "outputs": [
-        {
-          "name": "images",
-          "type": "IMAGE",
-          "links": [
-            51
-          ],
-          "shape": 3,
-          "slot_index": 0
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "CogVideoDecode"
-      }
-    },
    {
      "id": 28,
      "type": "VHS_VideoCombine",
@ -169,7 +135,7 @@
          "hidden": false,
          "paused": false,
          "params": {
-            "filename": "AnimateDiff_00001.mp4",
+            "filename": "CogVideoX_00001.mp4",
            "subfolder": "",
            "type": "temp",
            "format": "video/h264-mp4",
@ -185,10 +151,10 @@
        500,
        308
      ],
-      "size": [
-        474.84501511852204,
-        164.74235966960538
-      ],
+      "size": {
+        "0": 474.8450012207031,
+        "1": 164.7423553466797
+      },
      "flags": {},
      "order": 2,
      "mode": 0,
@ -258,10 +224,10 @@
        503,
        521
      ],
-      "size": [
-        463.01251866466464,
-        98.10446321574796
-      ],
+      "size": {
+        "0": 463.01251220703125,
+        "1": 98.10446166992188
+      },
      "flags": {},
      "order": 3,
      "mode": 0,
@ -321,6 +287,47 @@
      "widgets_values": [
        "fp16"
      ]
+    },
+    {
+      "id": 11,
+      "type": "CogVideoDecode",
+      "pos": [
+        1138,
+        725
+      ],
+      "size": {
+        "0": 210,
+        "1": 46
+      },
+      "flags": {},
+      "order": 5,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "pipeline",
+          "type": "COGVIDEOPIPE",
+          "link": 37
+        },
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "link": 38
+        }
+      ],
+      "outputs": [
+        {
+          "name": "images",
+          "type": "IMAGE",
+          "links": [
+            51
+          ],
+          "shape": 3,
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoDecode"
+      }
    }
  ],
  "links": [
@ -393,10 +400,10 @@
  "config": {},
  "extra": {
    "ds": {
-      "scale": 0.6830134553650706,
+      "scale": 0.9090909090909092,
      "offset": [
-        359.4381777891929,
-        334.95283678425216
+        12.99028921497383,
+        38.21608107136124
      ]
    }
  },
--- a/nodes.py
+++ b/nodes.py
@ -2,7 +2,7 @@ import os
 import torch
 import folder_paths
 import comfy.model_management as mm
-
+from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
 from .pipeline_cogvideox import CogVideoXPipeline

 import logging
@ -54,11 +54,11 @@ class DownloadAndLoadCogVideoModel:
            )

        pipe = CogVideoXPipeline.from_pretrained(base_path, torch_dtype=dtype).to(offload_device)
-        

        pipeline = {
            "pipe": pipe,
-            "dtype": dtype
+            "dtype": dtype,
+            "base_path": base_path
        }

        return (pipeline,)
@ -115,11 +115,15 @@ class CogVideoTextEncode:
    CATEGORY = "CogVideoWrapper"

    def process(self, clip, prompt):
+        load_device = mm.text_encoder_device()
+        offload_device = mm.text_encoder_offload_device()
        clip.tokenizer.t5xxl.pad_to_max_length = True
        clip.tokenizer.t5xxl.max_length = 226
+        clip.cond_stage_model.to(load_device)
        tokens = clip.tokenize(prompt, return_word_ids=True)

        embeds = clip.encode_from_tokens(tokens, return_pooled=False, return_dict=False)
+        clip.cond_stage_model.to(offload_device)

        return (embeds, )
    
@ -194,6 +198,7 @@ class CogVideoSampler:
                "steps": ("INT", {"default": 25, "min": 1}),
                "cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
                "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
+                "scheduler": (["DDIM", "DPM"],),
            },
            "optional": {
                "samples": ("LATENT", ),
@ -206,16 +211,22 @@ class CogVideoSampler:
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"

-    def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, samples=None, denoise_strength=1.0):
+    def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, samples=None, denoise_strength=1.0):
        mm.soft_empty_cache()
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        pipe = pipeline["pipe"]
        dtype = pipeline["dtype"]
+        base_path = pipeline["base_path"]

        pipe.transformer.to(device)
        generator = torch.Generator(device=device).manual_seed(seed)

+        if scheduler == "DDIM":
+            pipe.scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder="scheduler")
+        elif scheduler == "DPM":
+            pipe.scheduler = CogVideoXDPMScheduler.from_pretrained(base_path, subfolder="scheduler")
+
        latents = pipeline["pipe"](
            num_inference_steps=steps,
            height = height,
@ -227,7 +238,6 @@ class CogVideoSampler:
            denoise_strength=denoise_strength,
            prompt_embeds=positive.to(dtype).to(device),
            negative_prompt_embeds=negative.to(dtype).to(device),
-            #negative_prompt_embeds=torch.zeros_like(embeds),
            generator=generator,
            output_type="latents",
            device=device
@ -264,11 +274,10 @@ class CogVideoDecode:
        if "num_frames" in pipeline:
            num_frames = pipeline["num_frames"]
            fps = pipeline["fps"]
-
-            
        else:
            num_frames = latents.shape[2]
            fps = 8
+
        num_seconds = num_frames // fps
        latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
        latents = 1 / vae.config.scaling_factor * latents
@ -278,17 +287,14 @@ class CogVideoDecode:
            # Whether or not to clear fake context parallel cache
            fake_cp = i + 1 < num_seconds
            start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3)
-
            current_frames = vae.decode(latents[:, :, start_frame:end_frame], fake_cp=fake_cp).sample
            frames.append(current_frames)
+            mm.soft_empty_cache()
        vae.to(offload_device)

        frames = torch.cat(frames, dim=2)
-        print(frames.min(), frames.max())
        video = pipeline["pipe"].video_processor.postprocess_video(video=frames, output_type="pt")
-        print(video.shape)
        video = video[0].permute(0, 2, 3, 1).cpu().float()
-        print(video.min(), video.max())

        return (video,)

--- a/pipeline_cogvideox.py
+++ b/pipeline_cogvideox.py
@ -222,22 +222,6 @@ class CogVideoXPipeline(DiffusionPipeline):
        latents = latents * self.scheduler.init_noise_sigma
        return latents, timesteps

-    def decode_latents(self, latents: torch.Tensor, num_seconds: int):
-        latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
-        latents = 1 / self.vae.config.scaling_factor * latents
-
-        frames = []
-        for i in range(num_seconds):
-            # Whether or not to clear fake context parallel cache
-            fake_cp = i + 1 < num_seconds
-            start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3)
-
-            current_frames = self.vae.decode(latents[:, :, start_frame:end_frame], fake_cp=fake_cp).sample
-            frames.append(current_frames)
-
-        frames = torch.cat(frames, dim=2)
-        return frames
-
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
@ -534,17 +518,7 @@ class CogVideoXPipeline(DiffusionPipeline):
                    progress_bar.update()
                    comfy_pbar.update(1)

-        if not output_type == "latents":
-            video = self.decode_latents(latents, num_frames // fps)
-            video = self.video_processor.postprocess_video(video=video, output_type=output_type)
-        else:
-            video = latents
-            print(video.shape)
-
        # Offload all models
        self.maybe_free_model_hooks()

-        if not return_dict:
-            return (video,)
        return latents
-        #return CogVideoXPipelineOutput(frames=video)