temporal tiling for longer outputs

This commit is contained in:
kijai 2024-08-07 17:46:04 +03:00
parent b602a015bb
commit bbfaee3adb
5 changed files with 1018 additions and 502 deletions

View File

@ -0,0 +1,475 @@
{
"last_node_id": 33,
"last_link_id": 60,
"nodes": [
{
"id": 30,
"type": "CogVideoTextEncode",
"pos": [
500,
308
],
"size": {
"0": 474.8450012207031,
"1": 164.7423553466797
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 54
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
55
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature\nacoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters\nthrough the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The\nbackground includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical\nperformance."
]
},
{
"id": 20,
"type": "CLIPLoader",
"pos": [
-59,
397
],
"size": {
"0": 451.30548095703125,
"1": 82
},
"flags": {},
"order": 0,
"mode": 0,
"outputs": [
{
"name": "CLIP",
"type": "CLIP",
"links": [
54,
56
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CLIPLoader"
},
"widgets_values": [
"t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
"sd3"
]
},
{
"id": 31,
"type": "CogVideoTextEncode",
"pos": [
503,
521
],
"size": {
"0": 463.01251220703125,
"1": 98.10446166992188
},
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 56
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
57
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
""
]
},
{
"id": 1,
"type": "DownloadAndLoadCogVideoModel",
"pos": [
649,
182
],
"size": {
"0": 315,
"1": 58
},
"flags": {},
"order": 1,
"mode": 0,
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
36
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoModel"
},
"widgets_values": [
"bf16"
]
},
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1140,
783
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 37
},
{
"name": "samples",
"type": "LATENT",
"link": 38
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
59
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{
"id": 33,
"type": "GetImageSizeAndCount",
"pos": [
1189,
134
],
"size": {
"0": 210,
"1": 86
},
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 59
}
],
"outputs": [
{
"name": "image",
"type": "IMAGE",
"links": [
60
],
"shape": 3,
"slot_index": 0
},
{
"name": "720 width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "480 height",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "122 count",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "GetImageSizeAndCount"
}
},
{
"id": 22,
"type": "CogVideoSampler",
"pos": [
1041,
342
],
"size": {
"0": 315,
"1": 382
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 36
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 55,
"slot_index": 1
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 57
},
{
"name": "samples",
"type": "LATENT",
"link": null
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
37
],
"shape": 3
},
{
"name": "samples",
"type": "LATENT",
"links": [
38
],
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoSampler"
},
"widgets_values": [
480,
720,
128,
8,
25,
6,
806286757407563,
"fixed",
"DDIM",
48,
12,
1
]
},
{
"id": 32,
"type": "VHS_VideoCombine",
"pos": [
1439,
122
],
"size": [
563.3333740234375,
686.2222493489583
],
"flags": {},
"order": 7,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 60,
"slot_index": 0
},
{
"name": "audio",
"type": "VHS_AUDIO",
"link": null
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
},
{
"name": "vae",
"type": "VAE",
"link": null
}
],
"outputs": [
{
"name": "Filenames",
"type": "VHS_FILENAMES",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_VideoCombine"
},
"widgets_values": {
"frame_rate": 8,
"loop_count": 0,
"filename_prefix": "AnimateDiff",
"format": "video/h264-mp4",
"pix_fmt": "yuv420p",
"crf": 19,
"save_metadata": true,
"pingpong": false,
"save_output": false,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "AnimateDiff_00002.mp4",
"subfolder": "",
"type": "temp",
"format": "video/h264-mp4",
"frame_rate": 8
}
}
}
}
],
"links": [
[
36,
1,
0,
22,
0,
"COGVIDEOPIPE"
],
[
37,
22,
0,
11,
0,
"COGVIDEOPIPE"
],
[
38,
22,
1,
11,
1,
"LATENT"
],
[
54,
20,
0,
30,
0,
"CLIP"
],
[
55,
30,
0,
22,
1,
"CONDITIONING"
],
[
56,
20,
0,
31,
0,
"CLIP"
],
[
57,
31,
0,
22,
2,
"CONDITIONING"
],
[
59,
11,
0,
33,
0,
"IMAGE"
],
[
60,
33,
0,
32,
0,
"IMAGE"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.9090909090909091,
"offset": [
49.8551278885073,
87.4070604693312
]
}
},
"version": 0.4
}

View File

@ -1,6 +1,6 @@
{ {
"last_node_id": 69, "last_node_id": 70,
"last_link_id": 176, "last_link_id": 181,
"nodes": [ "nodes": [
{ {
"id": 20, "id": 20,
@ -48,7 +48,7 @@
"1": 86 "1": 86
}, },
"flags": {}, "flags": {},
"order": 13, "order": 12,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
@ -81,7 +81,7 @@
"shape": 3 "shape": 3
}, },
{ {
"name": "25 count", "name": "26 count",
"type": "INT", "type": "INT",
"links": [ "links": [
121 121
@ -166,47 +166,6 @@
"bf16" "bf16"
] ]
}, },
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1201,
684
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 12,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 166
},
{
"name": "samples",
"type": "LATENT",
"link": 167
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
118
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{ {
"id": 41, "id": 41,
"type": "ImageResizeKJ", "type": "ImageResizeKJ",
@ -225,7 +184,7 @@
{ {
"name": "image", "name": "image",
"type": "IMAGE", "type": "IMAGE",
"link": 128 "link": 180
}, },
{ {
"name": "get_image_size", "name": "get_image_size",
@ -328,124 +287,6 @@
"Node name for S&R": "CogVideoImageEncode" "Node name for S&R": "CogVideoImageEncode"
} }
}, },
{
"id": 57,
"type": "GetImageSizeAndCount",
"pos": [
603,
-65
],
"size": [
202.21431350127853,
99.2360176040001
],
"flags": {},
"order": 8,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 126,
"slot_index": 0
}
],
"outputs": [
{
"name": "image",
"type": "IMAGE",
"links": [
129,
136
],
"shape": 3,
"slot_index": 0
},
{
"name": "720 width",
"type": "INT",
"links": [
165
],
"shape": 3,
"slot_index": 1
},
{
"name": "480 height",
"type": "INT",
"links": [
164
],
"shape": 3,
"slot_index": 2
},
{
"name": "28 count",
"type": "INT",
"links": [
171,
173
],
"shape": 3,
"slot_index": 3
}
],
"properties": {
"Node name for S&R": "GetImageSizeAndCount"
}
},
{
"id": 67,
"type": "SimpleMath+",
"pos": [
665,
98
],
"size": {
"0": 315,
"1": 78
},
"flags": {
"collapsed": true
},
"order": 10,
"mode": 0,
"inputs": [
{
"name": "a",
"type": "INT,FLOAT",
"link": 173
},
{
"name": "b",
"type": "INT,FLOAT",
"link": null
}
],
"outputs": [
{
"name": "INT",
"type": "INT",
"links": [
174
],
"shape": 3,
"slot_index": 0
},
{
"name": "FLOAT",
"type": "FLOAT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "SimpleMath+"
},
"widgets_values": [
"a - 4"
]
},
{ {
"id": 59, "id": 59,
"type": "GetImageRangeFromBatch", "type": "GetImageRangeFromBatch",
@ -460,7 +301,7 @@
"flags": { "flags": {
"collapsed": true "collapsed": true
}, },
"order": 15, "order": 14,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
@ -520,7 +361,7 @@
"1": 102 "1": 102
}, },
"flags": {}, "flags": {},
"order": 16, "order": 15,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
@ -567,7 +408,7 @@
"flags": { "flags": {
"collapsed": true "collapsed": true
}, },
"order": 14, "order": 13,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
@ -605,184 +446,6 @@
"a - b" "a - b"
] ]
}, },
{
"id": 45,
"type": "VHS_LoadVideo",
"pos": [
-93,
-153
],
"size": [
235.1999969482422,
371.5999984741211
],
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
},
{
"name": "vae",
"type": "VAE",
"link": null
},
{
"name": "frame_load_cap",
"type": "INT",
"link": 176,
"widget": {
"name": "frame_load_cap"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
128
],
"shape": 3,
"slot_index": 0
},
{
"name": "frame_count",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "audio",
"type": "VHS_AUDIO",
"links": null,
"shape": 3
},
{
"name": "video_info",
"type": "VHS_VIDEOINFO",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_LoadVideo"
},
"widgets_values": {
"video": "jeep.mp4",
"force_rate": 0,
"force_size": "Disabled",
"custom_width": 512,
"custom_height": 512,
"frame_load_cap": 20,
"skip_first_frames": 0,
"select_every_nth": 1,
"choose video to upload": "image",
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"frame_load_cap": 20,
"skip_first_frames": 0,
"force_rate": 0,
"filename": "jeep.mp4",
"type": "input",
"format": "video/mp4",
"select_every_nth": 1
}
}
}
},
{
"id": 68,
"type": "SimpleMath+",
"pos": [
-75,
-197
],
"size": {
"0": 315,
"1": 78
},
"flags": {
"collapsed": true
},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "a",
"type": "INT,FLOAT",
"link": 175,
"slot_index": 0
},
{
"name": "b",
"type": "INT,FLOAT",
"link": null
}
],
"outputs": [
{
"name": "INT",
"type": "INT",
"links": [
176
],
"shape": 3,
"slot_index": 0
},
{
"name": "FLOAT",
"type": "FLOAT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "SimpleMath+"
},
"widgets_values": [
"a + 4"
]
},
{
"id": 69,
"type": "INTConstant",
"pos": [
-90,
-305
],
"size": [
200,
58
],
"flags": {},
"order": 2,
"mode": 0,
"outputs": [
{
"name": "value",
"type": "INT",
"links": [
175
],
"shape": 3
}
],
"title": "Frames to load",
"properties": {
"Node name for S&R": "INTConstant"
},
"widgets_values": [
24
],
"color": "#1b4669",
"bgcolor": "#29699c"
},
{ {
"id": 47, "id": 47,
"type": "VHS_VideoCombine", "type": "VHS_VideoCombine",
@ -795,7 +458,7 @@
711.3333333333333 711.3333333333333
], ],
"flags": {}, "flags": {},
"order": 17, "order": 16,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
@ -854,6 +517,47 @@
} }
} }
}, },
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1224,
737
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 11,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 166
},
{
"name": "samples",
"type": "LATENT",
"link": 167
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
118
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{ {
"id": 30, "id": 30,
"type": "CogVideoTextEncode", "type": "CogVideoTextEncode",
@ -890,9 +594,259 @@
"Node name for S&R": "CogVideoTextEncode" "Node name for S&R": "CogVideoTextEncode"
}, },
"widgets_values": [ "widgets_values": [
"A high-definition nature video showcasing a vibrant red panda as it gracefully runs down a crystal-clear stream, surrounded by the serene ambiance of a dense, verdant forest. The sunlight filters through the canopy of tall trees, casting dappled light on the forest floor, while the gentle sound of flowing water and rustling leaves creates a peaceful atmosphere. The red pandas fur glistens in the sunlight, highlighting its striking red and white markings as it navigates the stream with agility and playfulness, occasionally pausing to drink from the water or look around curiously." "A high-definition nature video showcasing a brown bear as it gracefully runs down a crystal-clear stream, surrounded by the serene ambiance of a dense, verdant forest. The sunlight filters through the canopy of tall trees, casting dappled light on the forest floor, while the gentle sound of flowing water and rustling leaves creates a peaceful atmosphere. The brown bear's fur glistens in the sunlight, highlighting its striking red and white markings as it navigates the stream with agility and playfulness."
] ]
}, },
{
"id": 57,
"type": "GetImageSizeAndCount",
"pos": [
603,
-65
],
"size": {
"0": 202.2143096923828,
"1": 99.23601531982422
},
"flags": {},
"order": 8,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 126,
"slot_index": 0
}
],
"outputs": [
{
"name": "image",
"type": "IMAGE",
"links": [
129,
136
],
"shape": 3,
"slot_index": 0
},
{
"name": "720 width",
"type": "INT",
"links": [
165
],
"shape": 3,
"slot_index": 1
},
{
"name": "480 height",
"type": "INT",
"links": [
164
],
"shape": 3,
"slot_index": 2
},
{
"name": "32 count",
"type": "INT",
"links": [
171,
178,
181
],
"shape": 3,
"slot_index": 3
}
],
"properties": {
"Node name for S&R": "GetImageSizeAndCount"
}
},
{
"id": 45,
"type": "VHS_LoadVideo",
"pos": [
-93,
-153
],
"size": [
235.1999969482422,
359.5999984741211
],
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
},
{
"name": "vae",
"type": "VAE",
"link": null
},
{
"name": "frame_load_cap",
"type": "INT",
"link": 177,
"widget": {
"name": "frame_load_cap"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
179
],
"shape": 3,
"slot_index": 0
},
{
"name": "frame_count",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "audio",
"type": "VHS_AUDIO",
"links": null,
"shape": 3
},
{
"name": "video_info",
"type": "VHS_VIDEOINFO",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_LoadVideo"
},
"widgets_values": {
"video": "jeep.mp4",
"force_rate": 0,
"force_size": "Disabled",
"custom_width": 512,
"custom_height": 512,
"frame_load_cap": 20,
"skip_first_frames": 0,
"select_every_nth": 1,
"choose video to upload": "image",
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"frame_load_cap": 20,
"skip_first_frames": 0,
"force_rate": 0,
"filename": "jeep.mp4",
"type": "input",
"format": "video/mp4",
"select_every_nth": 1
}
}
}
},
{
"id": 70,
"type": "GetImageSizeAndCount",
"pos": [
214,
-234
],
"size": {
"0": 202.2143096923828,
"1": 99.23601531982422
},
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 179,
"slot_index": 0
}
],
"outputs": [
{
"name": "image",
"type": "IMAGE",
"links": [
180
],
"shape": 3,
"slot_index": 0
},
{
"name": "512 width",
"type": "INT",
"links": [],
"shape": 3,
"slot_index": 1
},
{
"name": "256 height",
"type": "INT",
"links": [],
"shape": 3,
"slot_index": 2
},
{
"name": "32 count",
"type": "INT",
"links": [],
"shape": 3,
"slot_index": 3
}
],
"properties": {
"Node name for S&R": "GetImageSizeAndCount"
}
},
{
"id": 69,
"type": "INTConstant",
"pos": [
-90,
-305
],
"size": {
"0": 210,
"1": 58
},
"flags": {},
"order": 2,
"mode": 0,
"outputs": [
{
"name": "value",
"type": "INT",
"links": [
177
],
"shape": 3
}
],
"title": "Frames to load",
"properties": {
"Node name for S&R": "INTConstant"
},
"widgets_values": [
32
],
"color": "#1b4669",
"bgcolor": "#29699c"
},
{ {
"id": 64, "id": 64,
"type": "CogVideoSampler", "type": "CogVideoSampler",
@ -902,10 +856,10 @@
], ],
"size": [ "size": [
315, 315,
342 370
], ],
"flags": {}, "flags": {},
"order": 11, "order": 10,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
@ -947,10 +901,19 @@
{ {
"name": "num_frames", "name": "num_frames",
"type": "INT", "type": "INT",
"link": 174, "link": 178,
"widget": { "widget": {
"name": "num_frames" "name": "num_frames"
} }
},
{
"name": "t_tile_length",
"type": "INT",
"link": 181,
"widget": {
"name": "t_tile_length"
},
"slot_index": 7
} }
], ],
"outputs": [ "outputs": [
@ -979,12 +942,14 @@
720, 720,
16, 16,
8, 8,
50, 25,
9, 9,
12, 13,
"fixed", "fixed",
"DPM", "DDIM",
0.81 32,
2,
0.8
] ]
} }
], ],
@ -1037,14 +1002,6 @@
0, 0,
"IMAGE" "IMAGE"
], ],
[
128,
45,
0,
41,
0,
"IMAGE"
],
[ [
129, 129,
57, 57,
@ -1166,35 +1123,43 @@
"LATENT" "LATENT"
], ],
[ [
173, 177,
57, 69,
3,
67,
0, 0,
"INT,FLOAT" 45,
2,
"INT"
], ],
[ [
174, 178,
67, 57,
0, 3,
64, 64,
6, 6,
"INT" "INT"
], ],
[ [
175, 179,
69, 45,
0, 0,
68, 70,
0, 0,
"INT,FLOAT" "IMAGE"
], ],
[ [
176, 180,
68, 70,
0, 0,
45, 41,
2, 0,
"IMAGE"
],
[
181,
57,
3,
64,
7,
"INT" "INT"
] ]
], ],
@ -1204,8 +1169,8 @@
"ds": { "ds": {
"scale": 0.7513148009015777, "scale": 0.7513148009015777,
"offset": [ "offset": [
281.39770788130244, 177.74090581831425,
559.6153930987157 461.56507330501444
] ]
} }
}, },

View File

@ -2,77 +2,6 @@
"last_node_id": 31, "last_node_id": 31,
"last_link_id": 57, "last_link_id": 57,
"nodes": [ "nodes": [
{
"id": 22,
"type": "CogVideoSampler",
"pos": [
1041,
342
],
"size": {
"0": 315,
"1": 334
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 36
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 55,
"slot_index": 1
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 57
},
{
"name": "samples",
"type": "LATENT",
"link": null
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
37
],
"shape": 3
},
{
"name": "samples",
"type": "LATENT",
"links": [
38
],
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoSampler"
},
"widgets_values": [
480,
720,
16,
8,
25,
6,
806286757407561,
"fixed",
"DDIM",
1
]
},
{ {
"id": 28, "id": 28,
"type": "VHS_VideoCombine", "type": "VHS_VideoCombine",
@ -82,7 +11,7 @@
], ],
"size": [ "size": [
667.752197265625, 667.752197265625,
755.8347981770833 310
], ],
"flags": {}, "flags": {},
"order": 6, "order": 6,
@ -292,8 +221,8 @@
"id": 11, "id": 11,
"type": "CogVideoDecode", "type": "CogVideoDecode",
"pos": [ "pos": [
1138, 1140,
725 783
], ],
"size": { "size": {
"0": 210, "0": 210,
@ -328,6 +257,79 @@
"properties": { "properties": {
"Node name for S&R": "CogVideoDecode" "Node name for S&R": "CogVideoDecode"
} }
},
{
"id": 22,
"type": "CogVideoSampler",
"pos": [
1041,
342
],
"size": {
"0": 315,
"1": 382
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 36
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 55,
"slot_index": 1
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 57
},
{
"name": "samples",
"type": "LATENT",
"link": null
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
37
],
"shape": 3
},
{
"name": "samples",
"type": "LATENT",
"links": [
38
],
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoSampler"
},
"widgets_values": [
480,
720,
16,
8,
25,
6,
806286757407561,
"fixed",
"DDIM",
16,
2,
1
]
} }
], ],
"links": [ "links": [
@ -400,10 +402,10 @@
"config": {}, "config": {},
"extra": { "extra": {
"ds": { "ds": {
"scale": 0.9090909090909092, "scale": 0.8264462809917356,
"offset": [ "offset": [
12.99028921497383, 253.92700064075518,
38.21608107136124 186.82608107136124
] ]
} }
}, },

View File

@ -153,17 +153,17 @@ class CogVideoImageEncode:
vae = pipeline["pipe"].vae vae = pipeline["pipe"].vae
vae.to(device) vae.to(device)
image = image * 2.0 - 1.0 input_image = image.clone() * 2.0 - 1.0
image = image.to(vae.dtype).to(device) input_image = input_image.to(vae.dtype).to(device)
image = image.unsqueeze(0).permute(0, 4, 1, 2, 3) # B, C, T, H, W input_image = input_image.unsqueeze(0).permute(0, 4, 1, 2, 3) # B, C, T, H, W
B, C, T, H, W = image.shape B, C, T, H, W = input_image.shape
chunk_size = 16 chunk_size = 16
latents_list = [] latents_list = []
# Loop through the temporal dimension in chunks of 16 # Loop through the temporal dimension in chunks of 16
for i in range(0, T, chunk_size): for i in range(0, T, chunk_size):
# Get the chunk of 16 frames (or remaining frames if less than 16 are left) # Get the chunk of 16 frames (or remaining frames if less than 16 are left)
end_index = min(i + chunk_size, T) end_index = min(i + chunk_size, T)
image_chunk = image[:, :, i:end_index, :, :] # Shape: [B, C, chunk_size, H, W] image_chunk = input_image[:, :, i:end_index, :, :] # Shape: [B, C, chunk_size, H, W]
# Encode the chunk of images # Encode the chunk of images
latents = vae.encode(image_chunk) latents = vae.encode(image_chunk)
@ -179,6 +179,7 @@ class CogVideoImageEncode:
latents = vae.config.scaling_factor * latents latents = vae.config.scaling_factor * latents
latents = latents.permute(0, 2, 1, 3, 4) # B, T_chunk, C, H, W latents = latents.permute(0, 2, 1, 3, 4) # B, T_chunk, C, H, W
latents_list.append(latents) latents_list.append(latents)
vae.clear_fake_context_parallel_cache()
# Concatenate all the chunks along the temporal dimension # Concatenate all the chunks along the temporal dimension
final_latents = torch.cat(latents_list, dim=1) final_latents = torch.cat(latents_list, dim=1)
@ -198,12 +199,14 @@ class CogVideoSampler:
"negative": ("CONDITIONING", ), "negative": ("CONDITIONING", ),
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}), "height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
"width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}), "width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
"num_frames": ("INT", {"default": 48, "min": 8, "max": 100, "step": 8}), "num_frames": ("INT", {"default": 48, "min": 8, "max": 1024, "step": 8}),
"fps": ("INT", {"default": 8, "min": 1, "max": 100, "step": 1}), "fps": ("INT", {"default": 8, "min": 1, "max": 100, "step": 1}),
"steps": ("INT", {"default": 25, "min": 1}), "steps": ("INT", {"default": 25, "min": 1}),
"cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}), "cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
"seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}), "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
"scheduler": (["DDIM", "DPM"],), "scheduler": (["DDIM", "DPM"],),
"t_tile_length": ("INT", {"default": 16, "min": 16, "max": 128, "step": 4}),
"t_tile_overlap": ("INT", {"default": 8, "min": 8, "max": 128, "step": 2}),
}, },
"optional": { "optional": {
"samples": ("LATENT", ), "samples": ("LATENT", ),
@ -216,14 +219,20 @@ class CogVideoSampler:
FUNCTION = "process" FUNCTION = "process"
CATEGORY = "CogVideoWrapper" CATEGORY = "CogVideoWrapper"
def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, samples=None, denoise_strength=1.0): def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, t_tile_length, t_tile_overlap, samples=None, denoise_strength=1.0):
mm.soft_empty_cache() mm.soft_empty_cache()
assert t_tile_length > t_tile_overlap, "t_tile_length must be greater than t_tile_overlap"
assert t_tile_length <= num_frames, "t_tile_length must be equal or less than num_frames"
t_tile_length = t_tile_length // 4
t_tile_overlap = t_tile_overlap // 4
device = mm.get_torch_device() device = mm.get_torch_device()
offload_device = mm.unet_offload_device() offload_device = mm.unet_offload_device()
pipe = pipeline["pipe"] pipe = pipeline["pipe"]
dtype = pipeline["dtype"] dtype = pipeline["dtype"]
base_path = pipeline["base_path"] base_path = pipeline["base_path"]
pipe.transformer.to(device) pipe.transformer.to(device)
generator = torch.Generator(device=device).manual_seed(seed) generator = torch.Generator(device=device).manual_seed(seed)
@ -237,6 +246,8 @@ class CogVideoSampler:
height = height, height = height,
width = width, width = width,
num_frames = num_frames, num_frames = num_frames,
t_tile_length = t_tile_length,
t_tile_overlap = t_tile_overlap,
fps = fps, fps = fps,
guidance_scale=cfg, guidance_scale=cfg,
latents=samples["samples"] if samples is not None else None, latents=samples["samples"] if samples is not None else None,

View File

@ -218,6 +218,16 @@ class CogVideoXPipeline(DiffusionPipeline):
self.scheduler.set_begin_index(t_start * self.scheduler.order) self.scheduler.set_begin_index(t_start * self.scheduler.order)
return timesteps.to(device), num_inference_steps - t_start return timesteps.to(device), num_inference_steps - t_start
def _gaussian_weights(self, t_tile_length, t_batch_size):
from numpy import pi, exp, sqrt
var = 0.01
midpoint = (t_tile_length - 1) / 2 # -1 because index goes from 0 to latent_width - 1
t_probs = [exp(-(t-midpoint)*(t-midpoint)/(t_tile_length*t_tile_length)/(2*var)) / sqrt(2*pi*var) for t in range(t_tile_length)]
weights = torch.tensor(t_probs)
weights = weights.unsqueeze(0).unsqueeze(2).unsqueeze(3).unsqueeze(4).repeat(1, t_batch_size,1, 1, 1)
return weights
@property @property
def guidance_scale(self): def guidance_scale(self):
@ -244,6 +254,8 @@ class CogVideoXPipeline(DiffusionPipeline):
height: int = 480, height: int = 480,
width: int = 720, width: int = 720,
num_frames: int = 48, num_frames: int = 48,
t_tile_length: int = 12,
t_tile_overlap: int = 4,
fps: int = 8, fps: int = 8,
num_inference_steps: int = 50, num_inference_steps: int = 50,
timesteps: Optional[List[int]] = None, timesteps: Optional[List[int]] = None,
@ -301,9 +313,9 @@ class CogVideoXPipeline(DiffusionPipeline):
argument. argument.
""" """
assert ( #assert (
num_frames <= 48 and num_frames % fps == 0 and fps == 8 # num_frames <= 48 and num_frames % fps == 0 and fps == 8
), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX." #), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX."
height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
@ -337,7 +349,10 @@ class CogVideoXPipeline(DiffusionPipeline):
# 5. Prepare latents. # 5. Prepare latents.
latent_channels = self.transformer.config.in_channels latent_channels = self.transformer.config.in_channels
num_frames += 1
if latents is None and num_frames == t_tile_length:
num_frames += 1
latents, timesteps = self.prepare_latents( latents, timesteps = self.prepare_latents(
batch_size * num_videos_per_prompt, batch_size * num_videos_per_prompt,
latent_channels, latent_channels,
@ -356,6 +371,9 @@ class CogVideoXPipeline(DiffusionPipeline):
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
t_tile_weights = self._gaussian_weights(t_tile_length=t_tile_length, t_batch_size=1).to(latents.device).to(latents.dtype)
print("latents.shape", latents.shape)
print("latents.device", latents.device)
# 7. Denoising loop # 7. Denoising loop
num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
comfy_pbar = ProgressBar(num_inference_steps) comfy_pbar = ProgressBar(num_inference_steps)
@ -365,45 +383,90 @@ class CogVideoXPipeline(DiffusionPipeline):
for i, t in enumerate(timesteps): for i, t in enumerate(timesteps):
if self.interrupt: if self.interrupt:
continue continue
#temporal tiling code based on https://github.com/mayuelala/FollowYourEmoji/blob/main/models/video_pipeline.py
# =====================================================
grid_ts = 0
cur_t = 0
while cur_t < latents.shape[1]:
cur_t = max(grid_ts * t_tile_length - t_tile_overlap * grid_ts, 0) + t_tile_length
grid_ts += 1
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents all_t = latents.shape[1]
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) latents_all_list = []
# =====================================================
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML for t_i in range(grid_ts):
timestep = t.expand(latent_model_input.shape[0]) if t_i < grid_ts - 1:
ofs_t = max(t_i * t_tile_length - t_tile_overlap * t_i, 0)
if t_i == grid_ts - 1:
ofs_t = all_t - t_tile_length
# predict noise model_output input_start_t = ofs_t
noise_pred = self.transformer( input_end_t = ofs_t + t_tile_length
hidden_states=latent_model_input,
encoder_hidden_states=prompt_embeds,
timestep=timestep,
return_dict=False,
)[0]
noise_pred = noise_pred.float()
# perform guidance #latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
# self._guidance_scale = 1 + guidance_scale * ( #latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
# )
# print(self._guidance_scale)
if self.do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1 latents_tile = latents[:, input_start_t:input_end_t,:, :, :]
if not isinstance(self.scheduler, CogVideoXDPMScheduler): latent_model_input_tile = torch.cat([latents_tile] * 2) if do_classifier_free_guidance else latents_tile
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] latent_model_input_tile = self.scheduler.scale_model_input(latent_model_input_tile, t)
else:
latents, old_pred_original_sample = self.scheduler.step( #t_input = t[None].to(device)
noise_pred, t_input = t.expand(latent_model_input_tile.shape[0]) # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
old_pred_original_sample,
t, # predict noise model_output
timesteps[i - 1] if i > 0 else None, noise_pred = self.transformer(
latents, hidden_states=latent_model_input_tile,
**extra_step_kwargs, encoder_hidden_states=prompt_embeds,
timestep=t_input,
return_dict=False, return_dict=False,
) )[0]
latents = latents.to(prompt_embeds.dtype) noise_pred = noise_pred.float()
if self.do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
if not isinstance(self.scheduler, CogVideoXDPMScheduler):
latents_tile = self.scheduler.step(noise_pred, t, latents_tile, **extra_step_kwargs, return_dict=False)[0]
else:
raise NotImplementedError("DPM is not supported with temporal tiling")
# else:
# latents_tile, old_pred_original_sample = self.scheduler.step(
# noise_pred,
# old_pred_original_sample,
# t,
# t_input[t_i - 1] if t_i > 0 else None,
# latents_tile,
# **extra_step_kwargs,
# return_dict=False,
# )
latents_all_list.append(latents_tile)
# ==========================================
latents_all = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype)
contributors = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype)
# Add each tile contribution to overall latents
for t_i in range(grid_ts):
if t_i < grid_ts - 1:
ofs_t = max(t_i * t_tile_length - t_tile_overlap * t_i, 0)
if t_i == grid_ts - 1:
ofs_t = all_t - t_tile_length
input_start_t = ofs_t
input_end_t = ofs_t + t_tile_length
latents_all[:, input_start_t:input_end_t,:, :, :] += latents_all_list[t_i] * t_tile_weights
contributors[:, input_start_t:input_end_t,:, :, :] += t_tile_weights
latents_all /= contributors
latents = latents_all
# ==========================================
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update() progress_bar.update()