From bbfaee3adb0773888c008f595ad36ff981cac462 Mon Sep 17 00:00:00 2001
From: kijai <40791699+kijai@users.noreply.github.com>
Date: Wed, 7 Aug 2024 17:46:04 +0300
Subject: [PATCH] temporal tiling for longer outputs

---
 examples/cogvideo_long_01.json                | 475 ++++++++++++
 .../cogvideo_vid2vid_test_example_01.json     | 725 +++++++++---------
 examples/example_01.json                      | 156 ++--
 nodes.py                                      |  27 +-
 pipeline_cogvideox.py                         | 137 +++-
 5 files changed, 1018 insertions(+), 502 deletions(-)
 create mode 100644 examples/cogvideo_long_01.json

diff --git a/examples/cogvideo_long_01.json b/examples/cogvideo_long_01.json
new file mode 100644
index 0000000..fb2b920
--- /dev/null
+++ b/examples/cogvideo_long_01.json
@@ -0,0 +1,475 @@
+{
+  "last_node_id": 33,
+  "last_link_id": 60,
+  "nodes": [
+    {
+      "id": 30,
+      "type": "CogVideoTextEncode",
+      "pos": [
+        500,
+        308
+      ],
+      "size": {
+        "0": 474.8450012207031,
+        "1": 164.7423553466797
+      },
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "clip",
+          "type": "CLIP",
+          "link": 54
+        }
+      ],
+      "outputs": [
+        {
+          "name": "conditioning",
+          "type": "CONDITIONING",
+          "links": [
+            55
+          ],
+          "shape": 3,
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoTextEncode"
+      },
+      "widgets_values": [
+        "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature\nacoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters\nthrough the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The\nbackground includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical\nperformance."
+      ]
+    },
+    {
+      "id": 20,
+      "type": "CLIPLoader",
+      "pos": [
+        -59,
+        397
+      ],
+      "size": {
+        "0": 451.30548095703125,
+        "1": 82
+      },
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "outputs": [
+        {
+          "name": "CLIP",
+          "type": "CLIP",
+          "links": [
+            54,
+            56
+          ],
+          "shape": 3,
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CLIPLoader"
+      },
+      "widgets_values": [
+        "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
+        "sd3"
+      ]
+    },
+    {
+      "id": 31,
+      "type": "CogVideoTextEncode",
+      "pos": [
+        503,
+        521
+      ],
+      "size": {
+        "0": 463.01251220703125,
+        "1": 98.10446166992188
+      },
+      "flags": {},
+      "order": 3,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "clip",
+          "type": "CLIP",
+          "link": 56
+        }
+      ],
+      "outputs": [
+        {
+          "name": "conditioning",
+          "type": "CONDITIONING",
+          "links": [
+            57
+          ],
+          "shape": 3,
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoTextEncode"
+      },
+      "widgets_values": [
+        ""
+      ]
+    },
+    {
+      "id": 1,
+      "type": "DownloadAndLoadCogVideoModel",
+      "pos": [
+        649,
+        182
+      ],
+      "size": {
+        "0": 315,
+        "1": 58
+      },
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "outputs": [
+        {
+          "name": "cogvideo_pipe",
+          "type": "COGVIDEOPIPE",
+          "links": [
+            36
+          ],
+          "shape": 3,
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "DownloadAndLoadCogVideoModel"
+      },
+      "widgets_values": [
+        "bf16"
+      ]
+    },
+    {
+      "id": 11,
+      "type": "CogVideoDecode",
+      "pos": [
+        1140,
+        783
+      ],
+      "size": {
+        "0": 210,
+        "1": 46
+      },
+      "flags": {},
+      "order": 5,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "pipeline",
+          "type": "COGVIDEOPIPE",
+          "link": 37
+        },
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "link": 38
+        }
+      ],
+      "outputs": [
+        {
+          "name": "images",
+          "type": "IMAGE",
+          "links": [
+            59
+          ],
+          "shape": 3,
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoDecode"
+      }
+    },
+    {
+      "id": 33,
+      "type": "GetImageSizeAndCount",
+      "pos": [
+        1189,
+        134
+      ],
+      "size": {
+        "0": 210,
+        "1": 86
+      },
+      "flags": {},
+      "order": 6,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "image",
+          "type": "IMAGE",
+          "link": 59
+        }
+      ],
+      "outputs": [
+        {
+          "name": "image",
+          "type": "IMAGE",
+          "links": [
+            60
+          ],
+          "shape": 3,
+          "slot_index": 0
+        },
+        {
+          "name": "720 width",
+          "type": "INT",
+          "links": null,
+          "shape": 3
+        },
+        {
+          "name": "480 height",
+          "type": "INT",
+          "links": null,
+          "shape": 3
+        },
+        {
+          "name": "122 count",
+          "type": "INT",
+          "links": null,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "GetImageSizeAndCount"
+      }
+    },
+    {
+      "id": 22,
+      "type": "CogVideoSampler",
+      "pos": [
+        1041,
+        342
+      ],
+      "size": {
+        "0": 315,
+        "1": 382
+      },
+      "flags": {},
+      "order": 4,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "pipeline",
+          "type": "COGVIDEOPIPE",
+          "link": 36
+        },
+        {
+          "name": "positive",
+          "type": "CONDITIONING",
+          "link": 55,
+          "slot_index": 1
+        },
+        {
+          "name": "negative",
+          "type": "CONDITIONING",
+          "link": 57
+        },
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "link": null
+        }
+      ],
+      "outputs": [
+        {
+          "name": "cogvideo_pipe",
+          "type": "COGVIDEOPIPE",
+          "links": [
+            37
+          ],
+          "shape": 3
+        },
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "links": [
+            38
+          ],
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoSampler"
+      },
+      "widgets_values": [
+        480,
+        720,
+        128,
+        8,
+        25,
+        6,
+        806286757407563,
+        "fixed",
+        "DDIM",
+        48,
+        12,
+        1
+      ]
+    },
+    {
+      "id": 32,
+      "type": "VHS_VideoCombine",
+      "pos": [
+        1439,
+        122
+      ],
+      "size": [
+        563.3333740234375,
+        686.2222493489583
+      ],
+      "flags": {},
+      "order": 7,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "images",
+          "type": "IMAGE",
+          "link": 60,
+          "slot_index": 0
+        },
+        {
+          "name": "audio",
+          "type": "VHS_AUDIO",
+          "link": null
+        },
+        {
+          "name": "meta_batch",
+          "type": "VHS_BatchManager",
+          "link": null
+        },
+        {
+          "name": "vae",
+          "type": "VAE",
+          "link": null
+        }
+      ],
+      "outputs": [
+        {
+          "name": "Filenames",
+          "type": "VHS_FILENAMES",
+          "links": null,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "VHS_VideoCombine"
+      },
+      "widgets_values": {
+        "frame_rate": 8,
+        "loop_count": 0,
+        "filename_prefix": "AnimateDiff",
+        "format": "video/h264-mp4",
+        "pix_fmt": "yuv420p",
+        "crf": 19,
+        "save_metadata": true,
+        "pingpong": false,
+        "save_output": false,
+        "videopreview": {
+          "hidden": false,
+          "paused": false,
+          "params": {
+            "filename": "AnimateDiff_00002.mp4",
+            "subfolder": "",
+            "type": "temp",
+            "format": "video/h264-mp4",
+            "frame_rate": 8
+          }
+        }
+      }
+    }
+  ],
+  "links": [
+    [
+      36,
+      1,
+      0,
+      22,
+      0,
+      "COGVIDEOPIPE"
+    ],
+    [
+      37,
+      22,
+      0,
+      11,
+      0,
+      "COGVIDEOPIPE"
+    ],
+    [
+      38,
+      22,
+      1,
+      11,
+      1,
+      "LATENT"
+    ],
+    [
+      54,
+      20,
+      0,
+      30,
+      0,
+      "CLIP"
+    ],
+    [
+      55,
+      30,
+      0,
+      22,
+      1,
+      "CONDITIONING"
+    ],
+    [
+      56,
+      20,
+      0,
+      31,
+      0,
+      "CLIP"
+    ],
+    [
+      57,
+      31,
+      0,
+      22,
+      2,
+      "CONDITIONING"
+    ],
+    [
+      59,
+      11,
+      0,
+      33,
+      0,
+      "IMAGE"
+    ],
+    [
+      60,
+      33,
+      0,
+      32,
+      0,
+      "IMAGE"
+    ]
+  ],
+  "groups": [],
+  "config": {},
+  "extra": {
+    "ds": {
+      "scale": 0.9090909090909091,
+      "offset": [
+        49.8551278885073,
+        87.4070604693312
+      ]
+    }
+  },
+  "version": 0.4
+}
\ No newline at end of file
diff --git a/examples/cogvideo_vid2vid_test_example_01.json b/examples/cogvideo_vid2vid_test_example_01.json
index 2416c9d..1746fda 100644
--- a/examples/cogvideo_vid2vid_test_example_01.json
+++ b/examples/cogvideo_vid2vid_test_example_01.json
@@ -1,6 +1,6 @@
 {
-  "last_node_id": 69,
-  "last_link_id": 176,
+  "last_node_id": 70,
+  "last_link_id": 181,
   "nodes": [
     {
       "id": 20,
@@ -48,7 +48,7 @@
         "1": 86
       },
       "flags": {},
-      "order": 13,
+      "order": 12,
       "mode": 0,
       "inputs": [
         {
@@ -81,7 +81,7 @@
           "shape": 3
         },
         {
-          "name": "25 count",
+          "name": "26 count",
           "type": "INT",
           "links": [
             121
@@ -166,47 +166,6 @@
         "bf16"
       ]
     },
-    {
-      "id": 11,
-      "type": "CogVideoDecode",
-      "pos": [
-        1201,
-        684
-      ],
-      "size": {
-        "0": 210,
-        "1": 46
-      },
-      "flags": {},
-      "order": 12,
-      "mode": 0,
-      "inputs": [
-        {
-          "name": "pipeline",
-          "type": "COGVIDEOPIPE",
-          "link": 166
-        },
-        {
-          "name": "samples",
-          "type": "LATENT",
-          "link": 167
-        }
-      ],
-      "outputs": [
-        {
-          "name": "images",
-          "type": "IMAGE",
-          "links": [
-            118
-          ],
-          "shape": 3,
-          "slot_index": 0
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "CogVideoDecode"
-      }
-    },
     {
       "id": 41,
       "type": "ImageResizeKJ",
@@ -225,7 +184,7 @@
         {
           "name": "image",
           "type": "IMAGE",
-          "link": 128
+          "link": 180
         },
         {
           "name": "get_image_size",
@@ -328,124 +287,6 @@
         "Node name for S&R": "CogVideoImageEncode"
       }
     },
-    {
-      "id": 57,
-      "type": "GetImageSizeAndCount",
-      "pos": [
-        603,
-        -65
-      ],
-      "size": [
-        202.21431350127853,
-        99.2360176040001
-      ],
-      "flags": {},
-      "order": 8,
-      "mode": 0,
-      "inputs": [
-        {
-          "name": "image",
-          "type": "IMAGE",
-          "link": 126,
-          "slot_index": 0
-        }
-      ],
-      "outputs": [
-        {
-          "name": "image",
-          "type": "IMAGE",
-          "links": [
-            129,
-            136
-          ],
-          "shape": 3,
-          "slot_index": 0
-        },
-        {
-          "name": "720 width",
-          "type": "INT",
-          "links": [
-            165
-          ],
-          "shape": 3,
-          "slot_index": 1
-        },
-        {
-          "name": "480 height",
-          "type": "INT",
-          "links": [
-            164
-          ],
-          "shape": 3,
-          "slot_index": 2
-        },
-        {
-          "name": "28 count",
-          "type": "INT",
-          "links": [
-            171,
-            173
-          ],
-          "shape": 3,
-          "slot_index": 3
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "GetImageSizeAndCount"
-      }
-    },
-    {
-      "id": 67,
-      "type": "SimpleMath+",
-      "pos": [
-        665,
-        98
-      ],
-      "size": {
-        "0": 315,
-        "1": 78
-      },
-      "flags": {
-        "collapsed": true
-      },
-      "order": 10,
-      "mode": 0,
-      "inputs": [
-        {
-          "name": "a",
-          "type": "INT,FLOAT",
-          "link": 173
-        },
-        {
-          "name": "b",
-          "type": "INT,FLOAT",
-          "link": null
-        }
-      ],
-      "outputs": [
-        {
-          "name": "INT",
-          "type": "INT",
-          "links": [
-            174
-          ],
-          "shape": 3,
-          "slot_index": 0
-        },
-        {
-          "name": "FLOAT",
-          "type": "FLOAT",
-          "links": null,
-          "shape": 3
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "SimpleMath+"
-      },
-      "widgets_values": [
-        "a - 4"
-      ]
-    },
     {
       "id": 59,
       "type": "GetImageRangeFromBatch",
@@ -460,7 +301,7 @@
       "flags": {
         "collapsed": true
       },
-      "order": 15,
+      "order": 14,
       "mode": 0,
       "inputs": [
         {
@@ -520,7 +361,7 @@
         "1": 102
       },
       "flags": {},
-      "order": 16,
+      "order": 15,
       "mode": 0,
       "inputs": [
         {
@@ -567,7 +408,7 @@
       "flags": {
         "collapsed": true
       },
-      "order": 14,
+      "order": 13,
       "mode": 0,
       "inputs": [
         {
@@ -605,184 +446,6 @@
         "a - b"
       ]
     },
-    {
-      "id": 45,
-      "type": "VHS_LoadVideo",
-      "pos": [
-        -93,
-        -153
-      ],
-      "size": [
-        235.1999969482422,
-        371.5999984741211
-      ],
-      "flags": {},
-      "order": 6,
-      "mode": 0,
-      "inputs": [
-        {
-          "name": "meta_batch",
-          "type": "VHS_BatchManager",
-          "link": null
-        },
-        {
-          "name": "vae",
-          "type": "VAE",
-          "link": null
-        },
-        {
-          "name": "frame_load_cap",
-          "type": "INT",
-          "link": 176,
-          "widget": {
-            "name": "frame_load_cap"
-          }
-        }
-      ],
-      "outputs": [
-        {
-          "name": "IMAGE",
-          "type": "IMAGE",
-          "links": [
-            128
-          ],
-          "shape": 3,
-          "slot_index": 0
-        },
-        {
-          "name": "frame_count",
-          "type": "INT",
-          "links": null,
-          "shape": 3
-        },
-        {
-          "name": "audio",
-          "type": "VHS_AUDIO",
-          "links": null,
-          "shape": 3
-        },
-        {
-          "name": "video_info",
-          "type": "VHS_VIDEOINFO",
-          "links": null,
-          "shape": 3
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "VHS_LoadVideo"
-      },
-      "widgets_values": {
-        "video": "jeep.mp4",
-        "force_rate": 0,
-        "force_size": "Disabled",
-        "custom_width": 512,
-        "custom_height": 512,
-        "frame_load_cap": 20,
-        "skip_first_frames": 0,
-        "select_every_nth": 1,
-        "choose video to upload": "image",
-        "videopreview": {
-          "hidden": false,
-          "paused": false,
-          "params": {
-            "frame_load_cap": 20,
-            "skip_first_frames": 0,
-            "force_rate": 0,
-            "filename": "jeep.mp4",
-            "type": "input",
-            "format": "video/mp4",
-            "select_every_nth": 1
-          }
-        }
-      }
-    },
-    {
-      "id": 68,
-      "type": "SimpleMath+",
-      "pos": [
-        -75,
-        -197
-      ],
-      "size": {
-        "0": 315,
-        "1": 78
-      },
-      "flags": {
-        "collapsed": true
-      },
-      "order": 5,
-      "mode": 0,
-      "inputs": [
-        {
-          "name": "a",
-          "type": "INT,FLOAT",
-          "link": 175,
-          "slot_index": 0
-        },
-        {
-          "name": "b",
-          "type": "INT,FLOAT",
-          "link": null
-        }
-      ],
-      "outputs": [
-        {
-          "name": "INT",
-          "type": "INT",
-          "links": [
-            176
-          ],
-          "shape": 3,
-          "slot_index": 0
-        },
-        {
-          "name": "FLOAT",
-          "type": "FLOAT",
-          "links": null,
-          "shape": 3
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "SimpleMath+"
-      },
-      "widgets_values": [
-        "a + 4"
-      ]
-    },
-    {
-      "id": 69,
-      "type": "INTConstant",
-      "pos": [
-        -90,
-        -305
-      ],
-      "size": [
-        200,
-        58
-      ],
-      "flags": {},
-      "order": 2,
-      "mode": 0,
-      "outputs": [
-        {
-          "name": "value",
-          "type": "INT",
-          "links": [
-            175
-          ],
-          "shape": 3
-        }
-      ],
-      "title": "Frames to load",
-      "properties": {
-        "Node name for S&R": "INTConstant"
-      },
-      "widgets_values": [
-        24
-      ],
-      "color": "#1b4669",
-      "bgcolor": "#29699c"
-    },
     {
       "id": 47,
       "type": "VHS_VideoCombine",
@@ -795,7 +458,7 @@
         711.3333333333333
       ],
       "flags": {},
-      "order": 17,
+      "order": 16,
       "mode": 0,
       "inputs": [
         {
@@ -854,6 +517,47 @@
         }
       }
     },
+    {
+      "id": 11,
+      "type": "CogVideoDecode",
+      "pos": [
+        1224,
+        737
+      ],
+      "size": {
+        "0": 210,
+        "1": 46
+      },
+      "flags": {},
+      "order": 11,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "pipeline",
+          "type": "COGVIDEOPIPE",
+          "link": 166
+        },
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "link": 167
+        }
+      ],
+      "outputs": [
+        {
+          "name": "images",
+          "type": "IMAGE",
+          "links": [
+            118
+          ],
+          "shape": 3,
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoDecode"
+      }
+    },
     {
       "id": 30,
       "type": "CogVideoTextEncode",
@@ -890,9 +594,259 @@
         "Node name for S&R": "CogVideoTextEncode"
       },
       "widgets_values": [
-        "A high-definition nature video showcasing a vibrant red panda as it gracefully runs down a crystal-clear stream, surrounded by the serene ambiance of a dense, verdant forest. The sunlight filters through the canopy of tall trees, casting dappled light on the forest floor, while the gentle sound of flowing water and rustling leaves creates a peaceful atmosphere. The red panda’s fur glistens in the sunlight, highlighting its striking red and white markings as it navigates the stream with agility and playfulness, occasionally pausing to drink from the water or look around curiously."
+        "A high-definition nature video showcasing a brown bear as it gracefully runs down a crystal-clear stream, surrounded by the serene ambiance of a dense, verdant forest. The sunlight filters through the canopy of tall trees, casting dappled light on the forest floor, while the gentle sound of flowing water and rustling leaves creates a peaceful atmosphere. The brown bear's fur glistens in the sunlight, highlighting its thick, dark brown coat as it navigates the stream with agility and playfulness."
       ]
     },
+    {
+      "id": 57,
+      "type": "GetImageSizeAndCount",
+      "pos": [
+        603,
+        -65
+      ],
+      "size": {
+        "0": 202.2143096923828,
+        "1": 99.23601531982422
+      },
+      "flags": {},
+      "order": 8,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "image",
+          "type": "IMAGE",
+          "link": 126,
+          "slot_index": 0
+        }
+      ],
+      "outputs": [
+        {
+          "name": "image",
+          "type": "IMAGE",
+          "links": [
+            129,
+            136
+          ],
+          "shape": 3,
+          "slot_index": 0
+        },
+        {
+          "name": "720 width",
+          "type": "INT",
+          "links": [
+            165
+          ],
+          "shape": 3,
+          "slot_index": 1
+        },
+        {
+          "name": "480 height",
+          "type": "INT",
+          "links": [
+            164
+          ],
+          "shape": 3,
+          "slot_index": 2
+        },
+        {
+          "name": "32 count",
+          "type": "INT",
+          "links": [
+            171,
+            178,
+            181
+          ],
+          "shape": 3,
+          "slot_index": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "GetImageSizeAndCount"
+      }
+    },
+    {
+      "id": 45,
+      "type": "VHS_LoadVideo",
+      "pos": [
+        -93,
+        -153
+      ],
+      "size": [
+        235.1999969482422,
+        359.5999984741211
+      ],
+      "flags": {},
+      "order": 5,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "meta_batch",
+          "type": "VHS_BatchManager",
+          "link": null
+        },
+        {
+          "name": "vae",
+          "type": "VAE",
+          "link": null
+        },
+        {
+          "name": "frame_load_cap",
+          "type": "INT",
+          "link": 177,
+          "widget": {
+            "name": "frame_load_cap"
+          }
+        }
+      ],
+      "outputs": [
+        {
+          "name": "IMAGE",
+          "type": "IMAGE",
+          "links": [
+            179
+          ],
+          "shape": 3,
+          "slot_index": 0
+        },
+        {
+          "name": "frame_count",
+          "type": "INT",
+          "links": null,
+          "shape": 3
+        },
+        {
+          "name": "audio",
+          "type": "VHS_AUDIO",
+          "links": null,
+          "shape": 3
+        },
+        {
+          "name": "video_info",
+          "type": "VHS_VIDEOINFO",
+          "links": null,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "VHS_LoadVideo"
+      },
+      "widgets_values": {
+        "video": "jeep.mp4",
+        "force_rate": 0,
+        "force_size": "Disabled",
+        "custom_width": 512,
+        "custom_height": 512,
+        "frame_load_cap": 20,
+        "skip_first_frames": 0,
+        "select_every_nth": 1,
+        "choose video to upload": "image",
+        "videopreview": {
+          "hidden": false,
+          "paused": false,
+          "params": {
+            "frame_load_cap": 20,
+            "skip_first_frames": 0,
+            "force_rate": 0,
+            "filename": "jeep.mp4",
+            "type": "input",
+            "format": "video/mp4",
+            "select_every_nth": 1
+          }
+        }
+      }
+    },
+    {
+      "id": 70,
+      "type": "GetImageSizeAndCount",
+      "pos": [
+        214,
+        -234
+      ],
+      "size": {
+        "0": 202.2143096923828,
+        "1": 99.23601531982422
+      },
+      "flags": {},
+      "order": 6,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "image",
+          "type": "IMAGE",
+          "link": 179,
+          "slot_index": 0
+        }
+      ],
+      "outputs": [
+        {
+          "name": "image",
+          "type": "IMAGE",
+          "links": [
+            180
+          ],
+          "shape": 3,
+          "slot_index": 0
+        },
+        {
+          "name": "512 width",
+          "type": "INT",
+          "links": [],
+          "shape": 3,
+          "slot_index": 1
+        },
+        {
+          "name": "256 height",
+          "type": "INT",
+          "links": [],
+          "shape": 3,
+          "slot_index": 2
+        },
+        {
+          "name": "32 count",
+          "type": "INT",
+          "links": [],
+          "shape": 3,
+          "slot_index": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "GetImageSizeAndCount"
+      }
+    },
+    {
+      "id": 69,
+      "type": "INTConstant",
+      "pos": [
+        -90,
+        -305
+      ],
+      "size": {
+        "0": 210,
+        "1": 58
+      },
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "outputs": [
+        {
+          "name": "value",
+          "type": "INT",
+          "links": [
+            177
+          ],
+          "shape": 3
+        }
+      ],
+      "title": "Frames to load",
+      "properties": {
+        "Node name for S&R": "INTConstant"
+      },
+      "widgets_values": [
+        32
+      ],
+      "color": "#1b4669",
+      "bgcolor": "#29699c"
+    },
     {
       "id": 64,
       "type": "CogVideoSampler",
@@ -902,10 +856,10 @@
       ],
       "size": [
         315,
-        342
+        370
       ],
       "flags": {},
-      "order": 11,
+      "order": 10,
       "mode": 0,
       "inputs": [
         {
@@ -947,10 +901,19 @@
         {
           "name": "num_frames",
           "type": "INT",
-          "link": 174,
+          "link": 178,
           "widget": {
             "name": "num_frames"
           }
+        },
+        {
+          "name": "t_tile_length",
+          "type": "INT",
+          "link": 181,
+          "widget": {
+            "name": "t_tile_length"
+          },
+          "slot_index": 7
         }
       ],
       "outputs": [
@@ -979,12 +942,14 @@
         720,
         16,
         8,
-        50,
+        25,
         9,
-        12,
+        13,
         "fixed",
-        "DPM",
-        0.81
+        "DDIM",
+        32,
+        2,
+        0.8
       ]
     }
   ],
@@ -1037,14 +1002,6 @@
       0,
       "IMAGE"
     ],
-    [
-      128,
-      45,
-      0,
-      41,
-      0,
-      "IMAGE"
-    ],
     [
       129,
       57,
@@ -1166,35 +1123,43 @@
       "LATENT"
     ],
     [
-      173,
-      57,
-      3,
-      67,
+      177,
+      69,
       0,
-      "INT,FLOAT"
+      45,
+      2,
+      "INT"
     ],
     [
-      174,
-      67,
-      0,
+      178,
+      57,
+      3,
       64,
       6,
       "INT"
     ],
     [
-      175,
-      69,
+      179,
+      45,
       0,
-      68,
+      70,
       0,
-      "INT,FLOAT"
+      "IMAGE"
     ],
     [
-      176,
-      68,
+      180,
+      70,
       0,
-      45,
-      2,
+      41,
+      0,
+      "IMAGE"
+    ],
+    [
+      181,
+      57,
+      3,
+      64,
+      7,
       "INT"
     ]
   ],
@@ -1204,8 +1169,8 @@
     "ds": {
       "scale": 0.7513148009015777,
       "offset": [
-        281.39770788130244,
-        559.6153930987157
+        177.74090581831425,
+        461.56507330501444
       ]
     }
   },
diff --git a/examples/example_01.json b/examples/example_01.json
index 29a854f..a131b8c 100644
--- a/examples/example_01.json
+++ b/examples/example_01.json
@@ -2,77 +2,6 @@
   "last_node_id": 31,
   "last_link_id": 57,
   "nodes": [
-    {
-      "id": 22,
-      "type": "CogVideoSampler",
-      "pos": [
-        1041,
-        342
-      ],
-      "size": {
-        "0": 315,
-        "1": 334
-      },
-      "flags": {},
-      "order": 4,
-      "mode": 0,
-      "inputs": [
-        {
-          "name": "pipeline",
-          "type": "COGVIDEOPIPE",
-          "link": 36
-        },
-        {
-          "name": "positive",
-          "type": "CONDITIONING",
-          "link": 55,
-          "slot_index": 1
-        },
-        {
-          "name": "negative",
-          "type": "CONDITIONING",
-          "link": 57
-        },
-        {
-          "name": "samples",
-          "type": "LATENT",
-          "link": null
-        }
-      ],
-      "outputs": [
-        {
-          "name": "cogvideo_pipe",
-          "type": "COGVIDEOPIPE",
-          "links": [
-            37
-          ],
-          "shape": 3
-        },
-        {
-          "name": "samples",
-          "type": "LATENT",
-          "links": [
-            38
-          ],
-          "shape": 3
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "CogVideoSampler"
-      },
-      "widgets_values": [
-        480,
-        720,
-        16,
-        8,
-        25,
-        6,
-        806286757407561,
-        "fixed",
-        "DDIM",
-        1
-      ]
-    },
     {
       "id": 28,
       "type": "VHS_VideoCombine",
@@ -82,7 +11,7 @@
       ],
       "size": [
         667.752197265625,
-        755.8347981770833
+        310
       ],
       "flags": {},
       "order": 6,
@@ -292,8 +221,8 @@
       "id": 11,
       "type": "CogVideoDecode",
       "pos": [
-        1138,
-        725
+        1140,
+        783
       ],
       "size": {
         "0": 210,
@@ -328,6 +257,79 @@
       "properties": {
         "Node name for S&R": "CogVideoDecode"
       }
+    },
+    {
+      "id": 22,
+      "type": "CogVideoSampler",
+      "pos": [
+        1041,
+        342
+      ],
+      "size": {
+        "0": 315,
+        "1": 382
+      },
+      "flags": {},
+      "order": 4,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "pipeline",
+          "type": "COGVIDEOPIPE",
+          "link": 36
+        },
+        {
+          "name": "positive",
+          "type": "CONDITIONING",
+          "link": 55,
+          "slot_index": 1
+        },
+        {
+          "name": "negative",
+          "type": "CONDITIONING",
+          "link": 57
+        },
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "link": null
+        }
+      ],
+      "outputs": [
+        {
+          "name": "cogvideo_pipe",
+          "type": "COGVIDEOPIPE",
+          "links": [
+            37
+          ],
+          "shape": 3
+        },
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "links": [
+            38
+          ],
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoSampler"
+      },
+      "widgets_values": [
+        480,
+        720,
+        16,
+        8,
+        25,
+        6,
+        806286757407561,
+        "fixed",
+        "DDIM",
+        16,
+        2,
+        1
+      ]
     }
   ],
   "links": [
@@ -400,10 +402,10 @@
   "config": {},
   "extra": {
     "ds": {
-      "scale": 0.9090909090909092,
+      "scale": 0.8264462809917356,
       "offset": [
-        12.99028921497383,
-        38.21608107136124
+        253.92700064075518,
+        186.82608107136124
       ]
     }
   },
diff --git a/nodes.py b/nodes.py
index dc3bf73..7bd8186 100644
--- a/nodes.py
+++ b/nodes.py
@@ -153,17 +153,17 @@ class CogVideoImageEncode:
         vae = pipeline["pipe"].vae
         vae.to(device)
   
-        image = image * 2.0 - 1.0
-        image = image.to(vae.dtype).to(device)
-        image = image.unsqueeze(0).permute(0, 4, 1, 2, 3) # B, C, T, H, W
-        B, C, T, H, W = image.shape
+        input_image = image.clone() * 2.0 - 1.0
+        input_image = input_image.to(vae.dtype).to(device)
+        input_image = input_image.unsqueeze(0).permute(0, 4, 1, 2, 3) # B, C, T, H, W
+        B, C, T, H, W = input_image.shape
         chunk_size = 16
         latents_list = []
         # Loop through the temporal dimension in chunks of 16
         for i in range(0, T, chunk_size):
             # Get the chunk of 16 frames (or remaining frames if less than 16 are left)
             end_index = min(i + chunk_size, T)
-            image_chunk = image[:, :, i:end_index, :, :]  # Shape: [B, C, chunk_size, H, W]
+            image_chunk = input_image[:, :, i:end_index, :, :]  # Shape: [B, C, chunk_size, H, W]
 
             # Encode the chunk of images
             latents = vae.encode(image_chunk)
@@ -179,6 +179,7 @@ class CogVideoImageEncode:
             latents = vae.config.scaling_factor * latents
             latents = latents.permute(0, 2, 1, 3, 4)  # B, T_chunk, C, H, W
             latents_list.append(latents)
+        vae.clear_fake_context_parallel_cache()
 
         # Concatenate all the chunks along the temporal dimension
         final_latents = torch.cat(latents_list, dim=1)
@@ -198,12 +199,14 @@ class CogVideoSampler:
                 "negative": ("CONDITIONING", ),
                 "height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
                 "width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
-                "num_frames": ("INT", {"default": 48, "min": 8, "max": 100, "step": 8}),
+                "num_frames": ("INT", {"default": 48, "min": 8, "max": 1024, "step": 8}),
                 "fps": ("INT", {"default": 8, "min": 1, "max": 100, "step": 1}),
                 "steps": ("INT", {"default": 25, "min": 1}),
                 "cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
                 "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
                 "scheduler": (["DDIM", "DPM"],),
+                "t_tile_length": ("INT", {"default": 16, "min": 16, "max": 128, "step": 4}),
+                "t_tile_overlap": ("INT", {"default": 8, "min": 8, "max": 128, "step": 2}),
             },
             "optional": {
                 "samples": ("LATENT", ),
@@ -216,14 +219,20 @@ class CogVideoSampler:
     FUNCTION = "process"
     CATEGORY = "CogVideoWrapper"
 
-    def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, samples=None, denoise_strength=1.0):
+    def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, t_tile_length, t_tile_overlap, samples=None, denoise_strength=1.0):
         mm.soft_empty_cache()
+
+        assert t_tile_length > t_tile_overlap, "t_tile_length must be greater than t_tile_overlap"
+        assert t_tile_length <= num_frames, "t_tile_length must be equal or less than num_frames"
+        t_tile_length = t_tile_length // 4
+        t_tile_overlap = t_tile_overlap // 4
+
         device = mm.get_torch_device()
         offload_device = mm.unet_offload_device()
         pipe = pipeline["pipe"]
         dtype = pipeline["dtype"]
         base_path = pipeline["base_path"]
-
+        
         pipe.transformer.to(device)
         generator = torch.Generator(device=device).manual_seed(seed)
 
@@ -237,6 +246,8 @@ class CogVideoSampler:
             height = height,
             width = width,
             num_frames = num_frames,
+            t_tile_length = t_tile_length,
+            t_tile_overlap = t_tile_overlap,
             fps = fps,
             guidance_scale=cfg,
             latents=samples["samples"] if samples is not None else None,
diff --git a/pipeline_cogvideox.py b/pipeline_cogvideox.py
index f880b0e..b36846a 100644
--- a/pipeline_cogvideox.py
+++ b/pipeline_cogvideox.py
@@ -218,6 +218,16 @@ class CogVideoXPipeline(DiffusionPipeline):
             self.scheduler.set_begin_index(t_start * self.scheduler.order)
 
         return timesteps.to(device), num_inference_steps - t_start
+    
+    def _gaussian_weights(self, t_tile_length, t_batch_size):  # Gaussian blend weights, one per index of a tile of length t_tile_length
+        from numpy import pi, exp, sqrt
+
+        var = 0.01  # variance of the Gaussian, measured in units of (offset / t_tile_length)
+        midpoint = (t_tile_length - 1) / 2  # -1 because index goes from 0 to t_tile_length - 1
+        t_probs = [exp(-(t-midpoint)*(t-midpoint)/(t_tile_length*t_tile_length)/(2*var)) / sqrt(2*pi*var) for t in range(t_tile_length)]  # Gaussian pdf of the normalized offset from midpoint
+        weights = torch.tensor(t_probs)  # 1-D tensor with t_tile_length entries
+        weights = weights.unsqueeze(0).unsqueeze(2).unsqueeze(3).unsqueeze(4).repeat(1, t_batch_size,1, 1, 1)  # -> shape (1, t_tile_length * t_batch_size, 1, 1, 1)
+        return weights
 
     @property
     def guidance_scale(self):
@@ -244,6 +254,8 @@ class CogVideoXPipeline(DiffusionPipeline):
         height: int = 480,
         width: int = 720,
         num_frames: int = 48,
+        t_tile_length: int = 12,
+        t_tile_overlap: int = 4,
         fps: int = 8,
         num_inference_steps: int = 50,
         timesteps: Optional[List[int]] = None,
@@ -301,9 +313,9 @@ class CogVideoXPipeline(DiffusionPipeline):
                 argument.
         """
 
-        assert (
-            num_frames <= 48 and num_frames % fps == 0 and fps == 8
-        ), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX."
+        #assert (
+        #    num_frames <= 48 and num_frames % fps == 0 and fps == 8
+        #), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX."
 
         height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
         width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
@@ -337,7 +349,10 @@ class CogVideoXPipeline(DiffusionPipeline):
 
         # 5. Prepare latents.
         latent_channels = self.transformer.config.in_channels
-        num_frames += 1
+
+        if latents is None and num_frames == t_tile_length:
+            num_frames += 1
+
         latents, timesteps = self.prepare_latents(
             batch_size * num_videos_per_prompt,
             latent_channels,
@@ -356,6 +371,9 @@ class CogVideoXPipeline(DiffusionPipeline):
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
+        t_tile_weights = self._gaussian_weights(t_tile_length=t_tile_length, t_batch_size=1).to(latents.device).to(latents.dtype)
+        print("latents.shape", latents.shape)
+        print("latents.device", latents.device)
         # 7. Denoising loop
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
         comfy_pbar = ProgressBar(num_inference_steps)
@@ -365,45 +383,90 @@ class CogVideoXPipeline(DiffusionPipeline):
             for i, t in enumerate(timesteps):
                 if self.interrupt:
                     continue
+                
+                #temporal tiling code based on https://github.com/mayuelala/FollowYourEmoji/blob/main/models/video_pipeline.py
+                # =====================================================
+                grid_ts = 0
+                cur_t = 0
+                while cur_t < latents.shape[1]:
+                    cur_t = max(grid_ts * t_tile_length - t_tile_overlap * grid_ts, 0) + t_tile_length
+                    grid_ts += 1
 
-                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
-                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                all_t = latents.shape[1]
+                latents_all_list = []
+                # =====================================================
 
-                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-                timestep = t.expand(latent_model_input.shape[0])
+                for t_i in range(grid_ts):
+                    if t_i < grid_ts - 1:
+                        ofs_t = max(t_i * t_tile_length - t_tile_overlap * t_i, 0)
+                    if t_i == grid_ts - 1:
+                        ofs_t = all_t - t_tile_length
 
-                # predict noise model_output
-                noise_pred = self.transformer(
-                    hidden_states=latent_model_input,
-                    encoder_hidden_states=prompt_embeds,
-                    timestep=timestep,
-                    return_dict=False,
-                )[0]
-                noise_pred = noise_pred.float()
+                    input_start_t = ofs_t
+                    input_end_t = ofs_t + t_tile_length
 
-                # perform guidance
-                # self._guidance_scale = 1 + guidance_scale * (
-                #     (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
-                # )
-                # print(self._guidance_scale)
-                if self.do_classifier_free_guidance:
-                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    #latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                    #latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
 
-                # compute the previous noisy sample x_t -> x_t-1
-                if not isinstance(self.scheduler, CogVideoXDPMScheduler):
-                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
-                else:
-                    latents, old_pred_original_sample = self.scheduler.step(
-                        noise_pred,
-                        old_pred_original_sample,
-                        t,
-                        timesteps[i - 1] if i > 0 else None,
-                        latents,
-                        **extra_step_kwargs,
+                    latents_tile = latents[:, input_start_t:input_end_t,:, :, :]
+                    latent_model_input_tile = torch.cat([latents_tile] * 2) if do_classifier_free_guidance else latents_tile
+                    latent_model_input_tile = self.scheduler.scale_model_input(latent_model_input_tile, t)
+
+                    #t_input = t[None].to(device)
+                    t_input = t.expand(latent_model_input_tile.shape[0]) # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+            
+                    # predict noise model_output
+                    noise_pred = self.transformer(
+                        hidden_states=latent_model_input_tile,
+                        encoder_hidden_states=prompt_embeds,
+                        timestep=t_input,
                         return_dict=False,
-                    )
-                latents = latents.to(prompt_embeds.dtype)
+                    )[0]
+                    noise_pred = noise_pred.float()                  
+
+                    if self.do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    if not isinstance(self.scheduler, CogVideoXDPMScheduler):
+                        latents_tile = self.scheduler.step(noise_pred, t, latents_tile, **extra_step_kwargs, return_dict=False)[0]
+                    else:
+                        raise NotImplementedError("DPM is not supported with temporal tiling")
+                    # else:
+                    #     latents_tile, old_pred_original_sample = self.scheduler.step(
+                    #         noise_pred,
+                    #         old_pred_original_sample,
+                    #         t,
+                    #         t_input[t_i - 1] if t_i > 0 else None,
+                    #         latents_tile,
+                    #         **extra_step_kwargs,
+                    #         return_dict=False,
+                    #     )
+        
+                    latents_all_list.append(latents_tile)
+
+                # ==========================================
+                latents_all = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype)
+                contributors = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype)
+                # Add each tile contribution to overall latents
+                for t_i in range(grid_ts):
+                    if t_i < grid_ts - 1:
+                        ofs_t = max(t_i * t_tile_length - t_tile_overlap * t_i, 0)
+                    if t_i == grid_ts - 1:
+                        ofs_t = all_t - t_tile_length
+
+                    input_start_t = ofs_t
+                    input_end_t = ofs_t + t_tile_length
+
+                    latents_all[:, input_start_t:input_end_t,:, :, :] += latents_all_list[t_i] * t_tile_weights
+                    contributors[:, input_start_t:input_end_t,:, :, :] += t_tile_weights
+                
+                latents_all /= contributors
+
+                latents = latents_all
+                # ==========================================
+
 
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()