From 97e89d596e00bbdc97c7860270b5ad719eae2b28 Mon Sep 17 00:00:00 2001 From: kijai <40791699+kijai@users.noreply.github.com> Date: Wed, 7 Aug 2024 01:10:19 +0300 Subject: [PATCH] update examples, expose scheduler, force T5 offload --- .../cogvideo_vid2vid_test_example_01.json | 675 +++++++++--------- examples/example_01.json | 117 +-- nodes.py | 28 +- pipeline_cogvideox.py | 26 - 4 files changed, 436 insertions(+), 410 deletions(-) diff --git a/examples/cogvideo_vid2vid_test_example_01.json b/examples/cogvideo_vid2vid_test_example_01.json index 19efdd5..93b1532 100644 --- a/examples/cogvideo_vid2vid_test_example_01.json +++ b/examples/cogvideo_vid2vid_test_example_01.json @@ -1,46 +1,7 @@ { - "last_node_id": 59, - "last_link_id": 137, + "last_node_id": 64, + "last_link_id": 167, "nodes": [ - { - "id": 31, - "type": "CogVideoTextEncode", - "pos": [ - 503, - 521 - ], - "size": { - "0": 463.01251220703125, - "1": 98.10446166992188 - }, - "flags": {}, - "order": 4, - "mode": 0, - "inputs": [ - { - "name": "clip", - "type": "CLIP", - "link": 56 - } - ], - "outputs": [ - { - "name": "conditioning", - "type": "CONDITIONING", - "links": [ - 80 - ], - "shape": 3, - "slot_index": 0 - } - ], - "properties": { - "Node name for S&R": "CogVideoTextEncode" - }, - "widgets_values": [ - "" - ] - }, { "id": 1, "type": "DownloadAndLoadCogVideoModel", @@ -60,8 +21,8 @@ "name": "cogvideo_pipe", "type": "COGVIDEOPIPE", "links": [ - 78, - 83 + 83, + 159 ], "shape": 3, "slot_index": 0 @@ -108,47 +69,6 @@ "sd3" ] }, - { - "id": 11, - "type": "CogVideoDecode", - "pos": [ - 1199, - 661 - ], - "size": { - "0": 210, - "1": 46 - }, - "flags": {}, - "order": 9, - "mode": 0, - "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 81 - }, - { - "name": "samples", - "type": "LATENT", - "link": 82 - } - ], - "outputs": [ - { - "name": "images", - "type": "IMAGE", - "links": [ - 118 - ], - "shape": 3, - "slot_index": 0 - } - ], - "properties": { - "Node name for S&R": "CogVideoDecode" - } - }, { "id": 56, "type": "SimpleMath+", @@ -233,7 +153,7 @@ "name": "samples", "type": "LATENT", "links": [ - 122 + 162 ], "shape": 3, "slot_index": 0 @@ -301,84 +221,6 @@ "Node name for S&R": "GetImageSizeAndCount" } }, - { - "id": 41, - "type": "ImageResizeKJ", - "pos": [ - 315, - -19 - ], - "size": { - "0": 315, - "1": 242 - }, - "flags": {}, - "order": 5, - "mode": 0, - "inputs": [ - { - "name": "image", - "type": "IMAGE", - "link": 128 - }, - { - "name": "get_image_size", - "type": "IMAGE", - "link": null - }, - { - "name": "width_input", - "type": "INT", - "link": null, - "widget": { - "name": "width_input" - } - }, - { - "name": "height_input", - "type": "INT", - "link": null, - "widget": { - "name": "height_input" - } - } - ], - "outputs": [ - { - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 126 - ], - "shape": 3, - "slot_index": 0 - }, - { - "name": "width", - "type": "INT", - "links": null, - "shape": 3 - }, - { - "name": "height", - "type": "INT", - "links": null, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "ImageResizeKJ" - }, - "widgets_values": [ - 720, - 480, - "lanczos", - false, - 2, - 0, - 0 - ] - }, { "id": 59, "type": "GetImageRangeFromBatch", @@ -448,10 +290,10 @@ 1451, 368 ], - "size": [ - 315, - 102 - ], + "size": { + "0": 315, + "1": 102 + }, "flags": { "collapsed": true }, @@ -552,12 +394,12 @@ "id": 47, "type": "VHS_VideoCombine", "pos": [ - 1789, + 1790, -104 ], "size": [ - 1113.3311767578125, - 712.4437255859375 + 1110, + 711.3333333333333 ], "flags": {}, "order": 15, 
@@ -610,7 +452,7 @@ "hidden": false, "paused": false, "params": { - "filename": "AnimateDiff_00011.mp4", + "filename": "AnimateDiff_00008.mp4", "subfolder": "", "type": "temp", "format": "video/nvenc_h264-mp4", @@ -619,6 +461,190 @@ } } }, + { + "id": 57, + "type": "GetImageSizeAndCount", + "pos": [ + 674, + 2 + ], + "size": { + "0": 210, + "1": 86 + }, + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 126, + "slot_index": 0 + } + ], + "outputs": [ + { + "name": "image", + "type": "IMAGE", + "links": [ + 129, + 136 + ], + "shape": 3, + "slot_index": 0 + }, + { + "name": "720 width", + "type": "INT", + "links": [ + 165 + ], + "shape": 3, + "slot_index": 1 + }, + { + "name": "480 height", + "type": "INT", + "links": [ + 164 + ], + "shape": 3, + "slot_index": 2 + }, + { + "name": "16 count", + "type": "INT", + "links": [ + 163 + ], + "shape": 3, + "slot_index": 3 + } + ], + "properties": { + "Node name for S&R": "GetImageSizeAndCount" + } + }, + { + "id": 41, + "type": "ImageResizeKJ", + "pos": [ + 315, + -19 + ], + "size": { + "0": 315, + "1": 242 + }, + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 128 + }, + { + "name": "get_image_size", + "type": "IMAGE", + "link": null + }, + { + "name": "width_input", + "type": "INT", + "link": null, + "widget": { + "name": "width_input" + } + }, + { + "name": "height_input", + "type": "INT", + "link": null, + "widget": { + "name": "height_input" + } + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 126 + ], + "shape": 3, + "slot_index": 0 + }, + { + "name": "width", + "type": "INT", + "links": null, + "shape": 3 + }, + { + "name": "height", + "type": "INT", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "ImageResizeKJ" + }, + "widgets_values": [ + 720, + 480, + "lanczos", + false, + 2, + 0, + 0 + ] + }, + { + "id": 11, + "type": "CogVideoDecode", + "pos": [ + 1201, + 684 + ], + "size": { + "0": 210, + "1": 46 + }, + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 166 + }, + { + "name": "samples", + "type": "LATENT", + "link": 167 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 118 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CogVideoDecode" + } + }, { "id": 30, "type": "CogVideoTextEncode", @@ -645,7 +671,7 @@ "name": "conditioning", "type": "CONDITIONING", "links": [ - 79 + 160 ], "shape": 3, "slot_index": 0 @@ -655,84 +681,46 @@ "Node name for S&R": "CogVideoTextEncode" }, "widgets_values": [ - "video of dinosaur turning it's head in a cinematic and dramatic scene from a movie" + "cinematic video of a red panda turning it's head" ] }, { - "id": 36, - "type": "CogVideoSampler", + "id": 31, + "type": "CogVideoTextEncode", "pos": [ - 1093, - 292 - ], - "size": [ - 315, - 310 + 503, + 521 ], + "size": { + "0": 463.01251220703125, + "1": 98.10446166992188 + }, "flags": {}, - "order": 8, + "order": 4, "mode": 0, "inputs": [ { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 78 - }, - { - "name": "positive", - "type": "CONDITIONING", - "link": 79 - }, - { - "name": "negative", - "type": "CONDITIONING", - "link": 80 - }, - { - "name": "samples", - "type": "LATENT", - "link": 122 - }, - { - "name": "num_frames", - "type": "INT", - "link": 137, - "widget": { - "name": "num_frames" - } + "name": "clip", + "type": "CLIP", + "link": 
56 } ], "outputs": [ { - "name": "cogvideo_pipe", - "type": "COGVIDEOPIPE", + "name": "conditioning", + "type": "CONDITIONING", "links": [ - 81 + 161 ], - "shape": 3 - }, - { - "name": "samples", - "type": "LATENT", - "links": [ - 82 - ], - "shape": 3 + "shape": 3, + "slot_index": 0 } ], "properties": { - "Node name for S&R": "CogVideoSampler" + "Node name for S&R": "CogVideoTextEncode" }, "widgets_values": [ - 480, - 720, - 16, - 8, - 25, - 8, - 1119546789766856, - "fixed", - 0.8 + "bad quality video, blurry, messy" ] }, { @@ -819,63 +807,98 @@ } }, { - "id": 57, - "type": "GetImageSizeAndCount", + "id": 64, + "type": "CogVideoSampler", "pos": [ - 674, - 2 + 1090, + 290 ], "size": { - "0": 210, - "1": 86 + "0": 315, + "1": 342 }, "flags": {}, - "order": 6, + "order": 8, "mode": 0, "inputs": [ { - "name": "image", - "type": "IMAGE", - "link": 126, - "slot_index": 0 + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 159 + }, + { + "name": "positive", + "type": "CONDITIONING", + "link": 160 + }, + { + "name": "negative", + "type": "CONDITIONING", + "link": 161 + }, + { + "name": "samples", + "type": "LATENT", + "link": 162 + }, + { + "name": "num_frames", + "type": "INT", + "link": 163, + "widget": { + "name": "num_frames" + } + }, + { + "name": "height", + "type": "INT", + "link": 164, + "widget": { + "name": "height" + } + }, + { + "name": "width", + "type": "INT", + "link": 165, + "widget": { + "name": "width" + } } ], "outputs": [ { - "name": "image", - "type": "IMAGE", + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", "links": [ - 129, - 136 + 166 ], - "shape": 3, - "slot_index": 0 - }, - { - "name": "720 width", - "type": "INT", - "links": null, "shape": 3 }, { - "name": "480 height", - "type": "INT", - "links": null, - "shape": 3 - }, - { - "name": "16 count", - "type": "INT", + "name": "samples", + "type": "LATENT", "links": [ - 137 + 167 ], - "shape": 3, - "slot_index": 3 + "shape": 3 } ], "properties": { - "Node name for S&R": "GetImageSizeAndCount" - } + "Node name for S&R": "CogVideoSampler" + }, + "widgets_values": [ + 480, + 720, + 48, + 8, + 35, + 9, + 6, + "fixed", + "DPM", + 0.7000000000000001 + ] } ], "links": [ @@ -895,46 +918,6 @@ 0, "CLIP" ], - [ - 78, - 1, - 0, - 36, - 0, - "COGVIDEOPIPE" - ], - [ - 79, - 30, - 0, - 36, - 1, - "CONDITIONING" - ], - [ - 80, - 31, - 0, - 36, - 2, - "CONDITIONING" - ], - [ - 81, - 36, - 0, - 11, - 0, - "COGVIDEOPIPE" - ], - [ - 82, - 36, - 1, - 11, - 1, - "LATENT" - ], [ 83, 1, @@ -975,14 +958,6 @@ 0, "INT,FLOAT" ], - [ - 122, - 37, - 0, - 36, - 3, - "LATENT" - ], [ 126, 41, @@ -1048,22 +1023,86 @@ "IMAGE" ], [ - 137, + 159, + 1, + 0, + 64, + 0, + "COGVIDEOPIPE" + ], + [ + 160, + 30, + 0, + 64, + 1, + "CONDITIONING" + ], + [ + 161, + 31, + 0, + 64, + 2, + "CONDITIONING" + ], + [ + 162, + 37, + 0, + 64, + 3, + "LATENT" + ], + [ + 163, 57, 3, - 36, + 64, 4, "INT" + ], + [ + 164, + 57, + 2, + 64, + 5, + "INT" + ], + [ + 165, + 57, + 1, + 64, + 6, + "INT" + ], + [ + 166, + 64, + 0, + 11, + 0, + "COGVIDEOPIPE" + ], + [ + 167, + 64, + 1, + 11, + 1, + "LATENT" ] ], "groups": [], "config": {}, "extra": { "ds": { - "scale": 0.7513148009015777, + "scale": 0.6830134553650705, "offset": [ - 45.633655208726886, - 389.8041242612087 + 56.628416841109384, + 394.7727729054069 ] } }, diff --git a/examples/example_01.json b/examples/example_01.json index f707db7..1881508 100644 --- a/examples/example_01.json +++ b/examples/example_01.json @@ -11,7 +11,7 @@ ], "size": { "0": 315, - "1": 266 + "1": 334 }, "flags": {}, "order": 4, @@ -32,6 +32,11 @@ 
"name": "negative", "type": "CONDITIONING", "link": 57 + }, + { + "name": "samples", + "type": "LATENT", + "link": null } ], "outputs": [ @@ -63,50 +68,11 @@ 25, 6, 806286757407561, - "fixed" + "fixed", + "DDIM", + 1 ] }, - { - "id": 11, - "type": "CogVideoDecode", - "pos": [ - 1142, - 658 - ], - "size": { - "0": 210, - "1": 46 - }, - "flags": {}, - "order": 5, - "mode": 0, - "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 37 - }, - { - "name": "samples", - "type": "LATENT", - "link": 38 - } - ], - "outputs": [ - { - "name": "images", - "type": "IMAGE", - "links": [ - 51 - ], - "shape": 3, - "slot_index": 0 - } - ], - "properties": { - "Node name for S&R": "CogVideoDecode" - } - }, { "id": 28, "type": "VHS_VideoCombine", @@ -169,7 +135,7 @@ "hidden": false, "paused": false, "params": { - "filename": "AnimateDiff_00001.mp4", + "filename": "CogVideoX_00001.mp4", "subfolder": "", "type": "temp", "format": "video/h264-mp4", @@ -185,10 +151,10 @@ 500, 308 ], - "size": [ - 474.84501511852204, - 164.74235966960538 - ], + "size": { + "0": 474.8450012207031, + "1": 164.7423553466797 + }, "flags": {}, "order": 2, "mode": 0, @@ -258,10 +224,10 @@ 503, 521 ], - "size": [ - 463.01251866466464, - 98.10446321574796 - ], + "size": { + "0": 463.01251220703125, + "1": 98.10446166992188 + }, "flags": {}, "order": 3, "mode": 0, @@ -321,6 +287,47 @@ "widgets_values": [ "fp16" ] + }, + { + "id": 11, + "type": "CogVideoDecode", + "pos": [ + 1138, + 725 + ], + "size": { + "0": 210, + "1": 46 + }, + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 37 + }, + { + "name": "samples", + "type": "LATENT", + "link": 38 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 51 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CogVideoDecode" + } } ], "links": [ @@ -393,10 +400,10 @@ "config": {}, "extra": { "ds": { - "scale": 0.6830134553650706, + "scale": 0.9090909090909092, "offset": [ - 359.4381777891929, - 334.95283678425216 + 12.99028921497383, + 38.21608107136124 ] } }, diff --git a/nodes.py b/nodes.py index 584fc17..f562580 100644 --- a/nodes.py +++ b/nodes.py @@ -2,7 +2,7 @@ import os import torch import folder_paths import comfy.model_management as mm - +from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler from .pipeline_cogvideox import CogVideoXPipeline import logging @@ -54,11 +54,11 @@ class DownloadAndLoadCogVideoModel: ) pipe = CogVideoXPipeline.from_pretrained(base_path, torch_dtype=dtype).to(offload_device) - pipeline = { "pipe": pipe, - "dtype": dtype + "dtype": dtype, + "base_path": base_path } return (pipeline,) @@ -115,11 +115,15 @@ class CogVideoTextEncode: CATEGORY = "CogVideoWrapper" def process(self, clip, prompt): + load_device = mm.text_encoder_device() + offload_device = mm.text_encoder_offload_device() clip.tokenizer.t5xxl.pad_to_max_length = True clip.tokenizer.t5xxl.max_length = 226 + clip.cond_stage_model.to(load_device) tokens = clip.tokenize(prompt, return_word_ids=True) embeds = clip.encode_from_tokens(tokens, return_pooled=False, return_dict=False) + clip.cond_stage_model.to(offload_device) return (embeds, ) @@ -194,6 +198,7 @@ class CogVideoSampler: "steps": ("INT", {"default": 25, "min": 1}), "cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}), "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}), + "scheduler": (["DDIM", "DPM"],), }, "optional": { "samples": 
("LATENT", ), @@ -206,16 +211,22 @@ class CogVideoSampler: FUNCTION = "process" CATEGORY = "CogVideoWrapper" - def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, samples=None, denoise_strength=1.0): + def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, samples=None, denoise_strength=1.0): mm.soft_empty_cache() device = mm.get_torch_device() offload_device = mm.unet_offload_device() pipe = pipeline["pipe"] dtype = pipeline["dtype"] + base_path = pipeline["base_path"] pipe.transformer.to(device) generator = torch.Generator(device=device).manual_seed(seed) + if scheduler == "DDIM": + pipe.scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder="scheduler") + elif scheduler == "DPM": + pipe.scheduler = CogVideoXDPMScheduler.from_pretrained(base_path, subfolder="scheduler") + latents = pipeline["pipe"]( num_inference_steps=steps, height = height, @@ -227,7 +238,6 @@ class CogVideoSampler: denoise_strength=denoise_strength, prompt_embeds=positive.to(dtype).to(device), negative_prompt_embeds=negative.to(dtype).to(device), - #negative_prompt_embeds=torch.zeros_like(embeds), generator=generator, output_type="latents", device=device @@ -264,11 +274,10 @@ class CogVideoDecode: if "num_frames" in pipeline: num_frames = pipeline["num_frames"] fps = pipeline["fps"] - - else: num_frames = latents.shape[2] fps = 8 + num_seconds = num_frames // fps latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width] latents = 1 / vae.config.scaling_factor * latents @@ -278,17 +287,14 @@ class CogVideoDecode: # Whether or not to clear fake context parallel cache fake_cp = i + 1 < num_seconds start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3) - current_frames = vae.decode(latents[:, :, start_frame:end_frame], fake_cp=fake_cp).sample frames.append(current_frames) + mm.soft_empty_cache() vae.to(offload_device) frames = torch.cat(frames, dim=2) - print(frames.min(), frames.max()) video = pipeline["pipe"].video_processor.postprocess_video(video=frames, output_type="pt") - print(video.shape) video = video[0].permute(0, 2, 3, 1).cpu().float() - print(video.min(), video.max()) return (video,) diff --git a/pipeline_cogvideox.py b/pipeline_cogvideox.py index 4383322..7faa0d3 100644 --- a/pipeline_cogvideox.py +++ b/pipeline_cogvideox.py @@ -222,22 +222,6 @@ class CogVideoXPipeline(DiffusionPipeline): latents = latents * self.scheduler.init_noise_sigma return latents, timesteps - def decode_latents(self, latents: torch.Tensor, num_seconds: int): - latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width] - latents = 1 / self.vae.config.scaling_factor * latents - - frames = [] - for i in range(num_seconds): - # Whether or not to clear fake context parallel cache - fake_cp = i + 1 < num_seconds - start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3) - - current_frames = self.vae.decode(latents[:, :, start_frame:end_frame], fake_cp=fake_cp).sample - frames.append(current_frames) - - frames = torch.cat(frames, dim=2) - return frames - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature @@ -534,17 +518,7 @@ class CogVideoXPipeline(DiffusionPipeline): progress_bar.update() 
comfy_pbar.update(1) - if not output_type == "latents": - video = self.decode_latents(latents, num_frames // fps) - video = self.video_processor.postprocess_video(video=video, output_type=output_type) - else: - video = latents - print(video.shape) - # Offload all models self.maybe_free_model_hooks() - if not return_dict: - return (video,) return latents - #return CogVideoXPipelineOutput(frames=video)
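
Note: a minimal standalone sketch of the headline change above — the scheduler choice now exposed on the sampler node, which rebuilds the pipeline's scheduler from the model's `scheduler` subfolder using the diffusers classes imported in nodes.py. The snippet mirrors that logic outside ComfyUI; `model_dir` and the chosen `scheduler` value are illustrative assumptions, not part of the patch.

    import torch
    from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
    from pipeline_cogvideox import CogVideoXPipeline

    model_dir = "CogVideoX-2b"  # assumed path to a local CogVideoX checkpoint
    pipe = CogVideoXPipeline.from_pretrained(model_dir, torch_dtype=torch.float16)

    scheduler = "DPM"  # the new sampler widget offers "DDIM" or "DPM"
    if scheduler == "DDIM":
        pipe.scheduler = CogVideoXDDIMScheduler.from_pretrained(model_dir, subfolder="scheduler")
    elif scheduler == "DPM":
        pipe.scheduler = CogVideoXDPMScheduler.from_pretrained(model_dir, subfolder="scheduler")

    # After this patch the pipeline call returns raw latents (output_type="latents");
    # VAE decoding happens separately in the CogVideoDecode node.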