cleanup code, update examples, fix fp8 on the 2b model and with DDIM

2026-03-16 15:47:23 +08:00 · 2024-09-03 17:15:06 +03:00 · 2024-09-03 17:15:06 +03:00 · f836f2c24d
commit f836f2c24d
parent ffb9aac826
5 changed files with 850 additions and 564 deletions
--- a/examples/cogvideo_2b_temporal_tiling_long_01.json
+++ b/examples/cogvideo_2b_temporal_tiling_long_01.json
@ -5,10 +5,18 @@
    {
      "id": 30,
      "type": "CogVideoTextEncode",
-      "pos": [
-        500,
-        308
-      ],
+      "pos": {
+        "0": 500,
+        "1": 308,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
      "size": {
        "0": 474.8450012207031,
        "1": 164.7423553466797
@ -44,10 +52,18 @@
    {
      "id": 20,
      "type": "CLIPLoader",
-      "pos": [
-        -59,
-        397
-      ],
+      "pos": {
+        "0": -59,
+        "1": 397,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
      "size": {
        "0": 451.30548095703125,
        "1": 82
@ -55,6 +71,7 @@
      "flags": {},
      "order": 0,
      "mode": 0,
+      "inputs": [],
      "outputs": [
        {
          "name": "CLIP",
@ -78,10 +95,18 @@
    {
      "id": 31,
      "type": "CogVideoTextEncode",
-      "pos": [
-        503,
-        521
-      ],
+      "pos": {
+        "0": 503,
+        "1": 521,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
      "size": {
        "0": 463.01251220703125,
        "1": 98.10446166992188
@ -115,110 +140,144 @@
      ]
    },
    {
-      "id": 11,
-      "type": "CogVideoDecode",
-      "pos": [
-        1140,
-        783
-      ],
-      "size": {
-        "0": 210,
-        "1": 78
+      "id": 32,
+      "type": "VHS_VideoCombine",
+      "pos": {
+        "0": 1439,
+        "1": 122,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
      },
+      "size": [
+        563.3333740234375,
+        688.2124814882384
+      ],
      "flags": {},
-      "order": 5,
+      "order": 7,
      "mode": 0,
      "inputs": [
        {
-          "name": "pipeline",
-          "type": "COGVIDEOPIPE",
-          "link": 37
+          "name": "images",
+          "type": "IMAGE",
+          "link": 60,
+          "slot_index": 0
        },
        {
-          "name": "samples",
-          "type": "LATENT",
-          "link": 38
+          "name": "audio",
+          "type": "VHS_AUDIO",
+          "link": null
+        },
+        {
+          "name": "meta_batch",
+          "type": "VHS_BatchManager",
+          "link": null
+        },
+        {
+          "name": "vae",
+          "type": "VAE",
+          "link": null
        }
      ],
      "outputs": [
        {
-          "name": "images",
-          "type": "IMAGE",
+          "name": "Filenames",
+          "type": "VHS_FILENAMES",
+          "links": null,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "VHS_VideoCombine"
+      },
+      "widgets_values": {
+        "frame_rate": 8,
+        "loop_count": 0,
+        "filename_prefix": "CogVideo2B_long",
+        "format": "video/h264-mp4",
+        "pix_fmt": "yuv420p",
+        "crf": 19,
+        "save_metadata": true,
+        "pingpong": false,
+        "save_output": false,
+        "videopreview": {
+          "hidden": false,
+          "paused": false,
+          "params": {
+            "filename": "CogVideo2B_long_00001.mp4",
+            "subfolder": "",
+            "type": "temp",
+            "format": "video/h264-mp4",
+            "frame_rate": 8
+          }
+        }
+      }
+    },
+    {
+      "id": 1,
+      "type": "DownloadAndLoadCogVideoModel",
+      "pos": {
+        "0": 653,
+        "1": 90,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
+      "size": {
+        "0": 315,
+        "1": 154
+      },
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "cogvideo_pipe",
+          "type": "COGVIDEOPIPE",
          "links": [
-            59
+            36
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
-        "Node name for S&R": "CogVideoDecode"
+        "Node name for S&R": "DownloadAndLoadCogVideoModel"
      },
      "widgets_values": [
+        "THUDM/CogVideoX-2b",
+        "fp16",
+        "disabled",
+        "disabled",
        false
      ]
    },
-    {
-      "id": 33,
-      "type": "GetImageSizeAndCount",
-      "pos": [
-        1189,
-        134
-      ],
-      "size": {
-        "0": 210,
-        "1": 86
-      },
-      "flags": {},
-      "order": 6,
-      "mode": 0,
-      "inputs": [
-        {
-          "name": "image",
-          "type": "IMAGE",
-          "link": 59
-        }
-      ],
-      "outputs": [
-        {
-          "name": "image",
-          "type": "IMAGE",
-          "links": [
-            60
-          ],
-          "slot_index": 0,
-          "shape": 3
-        },
-        {
-          "name": "720 width",
-          "type": "INT",
-          "links": null,
-          "shape": 3
-        },
-        {
-          "name": "480 height",
-          "type": "INT",
-          "links": null,
-          "shape": 3
-        },
-        {
-          "name": "32 count",
-          "type": "INT",
-          "links": null,
-          "shape": 3
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "GetImageSizeAndCount"
-      }
-    },
    {
      "id": 22,
      "type": "CogVideoSampler",
-      "pos": [
-        1041,
-        342
-      ],
+      "pos": {
+        "0": 1041,
+        "1": 342,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
      "size": {
        "0": 315,
        "1": 382
@ -278,114 +337,129 @@
        6,
        6,
        806286757407563,
-        "DDIM",
+        "DDIM_tiled",
        48,
        8,
        1
      ]
    },
    {
-      "id": 1,
-      "type": "DownloadAndLoadCogVideoModel",
-      "pos": [
-        649,
-        182
-      ],
-      "size": {
-        "0": 315,
-        "1": 82
+      "id": 11,
+      "type": "CogVideoDecode",
+      "pos": {
+        "0": 1049,
+        "1": 772,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
      },
+      "size": [
+        295.70112532900725,
+        198
+      ],
      "flags": {},
-      "order": 1,
+      "order": 5,
      "mode": 0,
+      "inputs": [
+        {
+          "name": "pipeline",
+          "type": "COGVIDEOPIPE",
+          "link": 37
+        },
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "link": 38
+        }
+      ],
      "outputs": [
        {
-          "name": "cogvideo_pipe",
-          "type": "COGVIDEOPIPE",
+          "name": "images",
+          "type": "IMAGE",
          "links": [
-            36
+            59
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
-        "Node name for S&R": "DownloadAndLoadCogVideoModel"
+        "Node name for S&R": "CogVideoDecode"
      },
      "widgets_values": [
-        "THUDM/CogVideoX-2b",
-        "fp16"
+        true,
+        96,
+        96,
+        0.083,
+        0.083,
+        true
      ]
    },
    {
-      "id": 32,
-      "type": "VHS_VideoCombine",
-      "pos": [
-        1439,
-        122
-      ],
-      "size": [
-        563.3333740234375,
-        310
-      ],
+      "id": 33,
+      "type": "GetImageSizeAndCount",
+      "pos": {
+        "0": 1176,
+        "1": 122,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
+      "size": {
+        "0": 210,
+        "1": 86
+      },
      "flags": {},
-      "order": 7,
+      "order": 6,
      "mode": 0,
      "inputs": [
        {
-          "name": "images",
+          "name": "image",
          "type": "IMAGE",
-          "link": 60,
-          "slot_index": 0
-        },
-        {
-          "name": "audio",
-          "type": "VHS_AUDIO",
-          "link": null
-        },
-        {
-          "name": "meta_batch",
-          "type": "VHS_BatchManager",
-          "link": null
-        },
-        {
-          "name": "vae",
-          "type": "VAE",
-          "link": null
+          "link": 59
        }
      ],
      "outputs": [
        {
-          "name": "Filenames",
-          "type": "VHS_FILENAMES",
+          "name": "image",
+          "type": "IMAGE",
+          "links": [
+            60
+          ],
+          "slot_index": 0,
+          "shape": 3
+        },
+        {
+          "name": "728 width",
+          "type": "INT",
+          "links": null,
+          "shape": 3
+        },
+        {
+          "name": "485 height",
+          "type": "INT",
+          "links": null,
+          "shape": 3
+        },
+        {
+          "name": "96 count",
+          "type": "INT",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
-        "Node name for S&R": "VHS_VideoCombine"
-      },
-      "widgets_values": {
-        "frame_rate": 8,
-        "loop_count": 0,
-        "filename_prefix": "CogVideo2B_long",
-        "format": "video/h264-mp4",
-        "pix_fmt": "yuv420p",
-        "crf": 19,
-        "save_metadata": true,
-        "pingpong": false,
-        "save_output": false,
-        "videopreview": {
-          "hidden": false,
-          "paused": false,
-          "params": {
-            "filename": "AnimateDiff_00001.mp4",
-            "subfolder": "",
-            "type": "temp",
-            "format": "video/h264-mp4",
-            "frame_rate": 8
-          }
-        }
+        "Node name for S&R": "GetImageSizeAndCount"
      }
    }
  ],
@ -467,10 +541,10 @@
  "config": {},
  "extra": {
    "ds": {
-      "scale": 0.8264462809917354,
+      "scale": 0.7513148009015777,
      "offset": [
-        86.92928825501215,
-        77.5537144406024
+        253.3863163213836,
+        255.76127216744268
      ]
    }
  },
--- a/examples/cogvideo_2b_vid2vid_test_example_02.json
+++ b/examples/cogvideo_2b_vid2vid_test_example_02.json
@ -5,10 +5,18 @@
    {
      "id": 20,
      "type": "CLIPLoader",
-      "pos": [
-        -29,
-        407
-      ],
+      "pos": {
+        "0": -29,
+        "1": 407,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
      "size": {
        "0": 451.30548095703125,
        "1": 82
@ -16,6 +24,7 @@
      "flags": {},
      "order": 0,
      "mode": 0,
+      "inputs": [],
      "outputs": [
        {
          "name": "CLIP",
@ -39,10 +48,18 @@
    {
      "id": 31,
      "type": "CogVideoTextEncode",
-      "pos": [
-        503,
-        521
-      ],
+      "pos": {
+        "0": 503,
+        "1": 521,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
      "size": {
        "0": 463.01251220703125,
        "1": 98.10446166992188
@ -78,10 +95,18 @@
    {
      "id": 41,
      "type": "ImageResizeKJ",
-      "pos": [
-        206,
-        -69
-      ],
+      "pos": {
+        "0": 206,
+        "1": -69,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
      "size": {
        "0": 315,
        "1": 242
@ -154,100 +179,21 @@
        "disabled"
      ]
    },
-    {
-      "id": 37,
-      "type": "CogVideoImageEncode",
-      "pos": [
-        939,
-        -53
-      ],
-      "size": {
-        "0": 210,
-        "1": 46
-      },
-      "flags": {},
-      "order": 9,
-      "mode": 0,
-      "inputs": [
-        {
-          "name": "pipeline",
-          "type": "COGVIDEOPIPE",
-          "link": 83,
-          "slot_index": 0
-        },
-        {
-          "name": "image",
-          "type": "IMAGE",
-          "link": 129,
-          "slot_index": 1
-        }
-      ],
-      "outputs": [
-        {
-          "name": "samples",
-          "type": "LATENT",
-          "links": [
-            172
-          ],
-          "slot_index": 0,
-          "shape": 3
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "CogVideoImageEncode"
-      }
-    },
-    {
-      "id": 11,
-      "type": "CogVideoDecode",
-      "pos": [
-        1224,
-        737
-      ],
-      "size": {
-        "0": 210,
-        "1": 78
-      },
-      "flags": {},
-      "order": 11,
-      "mode": 0,
-      "inputs": [
-        {
-          "name": "pipeline",
-          "type": "COGVIDEOPIPE",
-          "link": 166
-        },
-        {
-          "name": "samples",
-          "type": "LATENT",
-          "link": 167
-        }
-      ],
-      "outputs": [
-        {
-          "name": "images",
-          "type": "IMAGE",
-          "links": [
-            118
-          ],
-          "slot_index": 0,
-          "shape": 3
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "CogVideoDecode"
-      },
-      "widgets_values": [
-        false
-      ]
-    },
    {
      "id": 30,
      "type": "CogVideoTextEncode",
-      "pos": [
-        500,
-        308
-      ],
+      "pos": {
+        "0": 500,
+        "1": 308,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
      "size": {
        "0": 474.8450012207031,
        "1": 164.7423553466797
@ -283,10 +229,18 @@
    {
      "id": 57,
      "type": "GetImageSizeAndCount",
-      "pos": [
-        603,
-        -65
-      ],
+      "pos": {
+        "0": 603,
+        "1": -65,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
      "size": {
        "0": 202.2143096923828,
        "1": 99.23601531982422
@ -332,7 +286,7 @@
          "shape": 3
        },
        {
-          "name": "32 count",
+          "name": "33 count",
          "type": "INT",
          "links": [
            178,
@ -349,10 +303,18 @@
    {
      "id": 45,
      "type": "VHS_LoadVideo",
-      "pos": [
-        -93,
-        -153
-      ],
+      "pos": {
+        "0": -93,
+        "1": -153,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
      "size": [
        235.1999969482422,
        359.5999984741211
@ -440,10 +402,18 @@
    {
      "id": 70,
      "type": "GetImageSizeAndCount",
-      "pos": [
-        214,
-        -234
-      ],
+      "pos": {
+        "0": 214,
+        "1": -234,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
      "size": {
        "0": 202.2143096923828,
        "1": 99.23601531982422
@ -484,7 +454,7 @@
          "shape": 3
        },
        {
-          "name": "32 count",
+          "name": "33 count",
          "type": "INT",
          "links": [],
          "slot_index": 3,
@ -498,10 +468,18 @@
    {
      "id": 69,
      "type": "INTConstant",
-      "pos": [
-        -90,
-        -305
-      ],
+      "pos": {
+        "0": -90,
+        "1": -305,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
      "size": {
        "0": 210,
        "1": 58
@ -509,6 +487,7 @@
      "flags": {},
      "order": 1,
      "mode": 0,
+      "inputs": [],
      "outputs": [
        {
          "name": "value",
@ -529,13 +508,145 @@
      "color": "#1b4669",
      "bgcolor": "#29699c"
    },
+    {
+      "id": 47,
+      "type": "VHS_VideoCombine",
+      "pos": {
+        "0": 1560,
+        "1": -379,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
+      "size": [
+        1110,
+        310
+      ],
+      "flags": {},
+      "order": 14,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "images",
+          "type": "IMAGE",
+          "link": 132
+        },
+        {
+          "name": "audio",
+          "type": "VHS_AUDIO",
+          "link": null
+        },
+        {
+          "name": "meta_batch",
+          "type": "VHS_BatchManager",
+          "link": null
+        },
+        {
+          "name": "vae",
+          "type": "VAE",
+          "link": null
+        }
+      ],
+      "outputs": [
+        {
+          "name": "Filenames",
+          "type": "VHS_FILENAMES",
+          "links": null,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "VHS_VideoCombine"
+      },
+      "widgets_values": {
+        "frame_rate": 8,
+        "loop_count": 0,
+        "filename_prefix": "CogVideoX_vid2vid",
+        "format": "video/h264-mp4",
+        "pix_fmt": "yuv420p",
+        "crf": 19,
+        "save_metadata": true,
+        "pingpong": false,
+        "save_output": false,
+        "videopreview": {
+          "hidden": false,
+          "paused": false,
+          "params": {
+            "filename": "AnimateDiff_00001.mp4",
+            "subfolder": "",
+            "type": "temp",
+            "format": "video/h264-mp4",
+            "frame_rate": 8
+          }
+        }
+      }
+    },
+    {
+      "id": 1,
+      "type": "DownloadAndLoadCogVideoModel",
+      "pos": {
+        "0": 606,
+        "1": 85,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
+      "size": {
+        "0": 315,
+        "1": 154
+      },
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "cogvideo_pipe",
+          "type": "COGVIDEOPIPE",
+          "links": [
+            83,
+            159
+          ],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "DownloadAndLoadCogVideoModel"
+      },
+      "widgets_values": [
+        "THUDM/CogVideoX-5b",
+        "fp16",
+        "disabled",
+        "disabled",
+        false
+      ]
+    },
    {
      "id": 64,
      "type": "CogVideoSampler",
-      "pos": [
-        1090,
-        290
-      ],
+      "pos": {
+        "0": 1090,
+        "1": 290,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
      "size": {
        "0": 312.9841613769531,
        "1": 342.8801574707031
@ -627,98 +738,144 @@
        6,
        9,
        "fixed",
-        "DDIM",
+        "DPM",
        "DDIM",
        8,
        0.85
      ]
    },
    {
-      "id": 1,
-      "type": "DownloadAndLoadCogVideoModel",
-      "pos": [
-        649,
-        182
-      ],
-      "size": {
-        "0": 315,
-        "1": 82
+      "id": 11,
+      "type": "CogVideoDecode",
+      "pos": {
+        "0": 1097,
+        "1": 681,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
      },
+      "size": [
+        301.1664045038119,
+        198
+      ],
      "flags": {},
-      "order": 2,
-      "mode": 0,
-      "outputs": [
-        {
-          "name": "cogvideo_pipe",
-          "type": "COGVIDEOPIPE",
-          "links": [
-            83,
-            159
-          ],
-          "slot_index": 0,
-          "shape": 3
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "DownloadAndLoadCogVideoModel"
-      },
-      "widgets_values": [
-        "THUDM/CogVideoX-2b",
-        "fp16"
-      ]
-    },
-    {
-      "id": 58,
-      "type": "ImageConcanate",
-      "pos": [
-        1499,
-        433
-      ],
-      "size": {
-        "0": 315,
-        "1": 102
-      },
-      "flags": {},
-      "order": 13,
+      "order": 11,
      "mode": 0,
      "inputs": [
        {
-          "name": "image1",
-          "type": "IMAGE",
-          "link": 191
+          "name": "pipeline",
+          "type": "COGVIDEOPIPE",
+          "link": 166
        },
        {
-          "name": "image2",
-          "type": "IMAGE",
-          "link": 170
+          "name": "samples",
+          "type": "LATENT",
+          "link": 167
        }
      ],
      "outputs": [
        {
-          "name": "IMAGE",
+          "name": "images",
          "type": "IMAGE",
          "links": [
-            132
+            118
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
-        "Node name for S&R": "ImageConcanate"
+        "Node name for S&R": "CogVideoDecode"
      },
      "widgets_values": [
-        "right",
-        false
+        false,
+        96,
+        96,
+        0.083,
+        0.083,
+        true
+      ]
+    },
+    {
+      "id": 37,
+      "type": "CogVideoImageEncode",
+      "pos": {
+        "0": 975,
+        "1": -73,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
+      "size": {
+        "0": 210,
+        "1": 122
+      },
+      "flags": {},
+      "order": 9,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "pipeline",
+          "type": "COGVIDEOPIPE",
+          "link": 83,
+          "slot_index": 0
+        },
+        {
+          "name": "image",
+          "type": "IMAGE",
+          "link": 129,
+          "slot_index": 1
+        },
+        {
+          "name": "mask",
+          "type": "MASK",
+          "link": null
+        }
+      ],
+      "outputs": [
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "links": [
+            172
+          ],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoImageEncode"
+      },
+      "widgets_values": [
+        8,
+        true
      ]
    },
    {
      "id": 55,
      "type": "GetImageSizeAndCount",
-      "pos": [
-        1223,
-        122
-      ],
+      "pos": {
+        "0": 1195,
+        "1": 154,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
      "size": {
        "0": 210,
        "1": 86
@ -769,75 +926,57 @@
      }
    },
    {
-      "id": 47,
-      "type": "VHS_VideoCombine",
-      "pos": [
-        1560,
-        -379
-      ],
-      "size": [
-        1110,
-        711.3333333333333
-      ],
+      "id": 58,
+      "type": "ImageConcanate",
+      "pos": {
+        "0": 1434,
+        "1": 289,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
+      "size": {
+        "0": 315,
+        "1": 102
+      },
      "flags": {},
-      "order": 14,
+      "order": 13,
      "mode": 0,
      "inputs": [
        {
-          "name": "images",
+          "name": "image1",
          "type": "IMAGE",
-          "link": 132
+          "link": 191
        },
        {
-          "name": "audio",
-          "type": "VHS_AUDIO",
-          "link": null
-        },
-        {
-          "name": "meta_batch",
-          "type": "VHS_BatchManager",
-          "link": null
-        },
-        {
-          "name": "vae",
-          "type": "VAE",
-          "link": null
+          "name": "image2",
+          "type": "IMAGE",
+          "link": 170
        }
      ],
      "outputs": [
        {
-          "name": "Filenames",
-          "type": "VHS_FILENAMES",
-          "links": null,
+          "name": "IMAGE",
+          "type": "IMAGE",
+          "links": [
+            132
+          ],
+          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
-        "Node name for S&R": "VHS_VideoCombine"
+        "Node name for S&R": "ImageConcanate"
      },
-      "widgets_values": {
-        "frame_rate": 8,
-        "loop_count": 0,
-        "filename_prefix": "CogVideoX_vid2vid",
-        "format": "video/h264-mp4",
-        "pix_fmt": "yuv420p",
-        "bitrate": 10,
-        "megabit": true,
-        "save_metadata": true,
-        "pingpong": false,
-        "save_output": false,
-        "videopreview": {
-          "hidden": false,
-          "paused": false,
-          "params": {
-            "filename": "AnimateDiff_00001.mp4",
-            "subfolder": "",
-            "type": "temp",
-            "format": "video/h264-mp4",
-            "frame_rate": 8
-          }
-        }
-      }
+      "widgets_values": [
+        "right",
+        false
+      ]
    }
  ],
  "links": [
@ -1022,10 +1161,10 @@
  "config": {},
  "extra": {
    "ds": {
-      "scale": 0.620921323059155,
+      "scale": 0.7513148009015777,
      "offset": [
-        298.59028824596885,
-        694.562497939138
+        280.8935954961883,
+        403.945992992638
      ]
    }
  },
--- a/examples/cogvideox_5b_example_01.json
+++ b/examples/cogvideox_5b_example_01.json
@ -5,10 +5,18 @@
    {
      "id": 31,
      "type": "CogVideoTextEncode",
-      "pos": [
-        503,
-        521
-      ],
+      "pos": {
+        "0": 503,
+        "1": 521,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
      "size": {
        "0": 463.01251220703125,
        "1": 98.10446166992188
@ -41,94 +49,25 @@
        ""
      ]
    },
-    {
-      "id": 11,
-      "type": "CogVideoDecode",
-      "pos": [
-        1140,
-        783
-      ],
-      "size": {
-        "0": 210,
-        "1": 78
-      },
-      "flags": {},
-      "order": 5,
-      "mode": 0,
-      "inputs": [
-        {
-          "name": "pipeline",
-          "type": "COGVIDEOPIPE",
-          "link": 63
-        },
-        {
-          "name": "samples",
-          "type": "LATENT",
-          "link": 64
-        }
-      ],
-      "outputs": [
-        {
-          "name": "images",
-          "type": "IMAGE",
-          "links": [
-            59
-          ],
-          "slot_index": 0,
-          "shape": 3
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "CogVideoDecode"
-      },
-      "widgets_values": [
-        false
-      ]
-    },
-    {
-      "id": 1,
-      "type": "DownloadAndLoadCogVideoModel",
-      "pos": [
-        649,
-        182
-      ],
-      "size": {
-        "0": 315,
-        "1": 82
-      },
-      "flags": {},
-      "order": 0,
-      "mode": 0,
-      "outputs": [
-        {
-          "name": "cogvideo_pipe",
-          "type": "COGVIDEOPIPE",
-          "links": [
-            60
-          ],
-          "slot_index": 0,
-          "shape": 3
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "DownloadAndLoadCogVideoModel"
-      },
-      "widgets_values": [
-        "THUDM/CogVideoX-5b",
-        "bf16"
-      ]
-    },
    {
      "id": 30,
      "type": "CogVideoTextEncode",
-      "pos": [
-        500,
-        308
-      ],
-      "size": [
-        471.90143257018326,
-        168.0804709842023
-      ],
+      "pos": {
+        "0": 500,
+        "1": 308,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
+      "size": {
+        "0": 471.90142822265625,
+        "1": 168.08047485351562
+      },
      "flags": {},
      "order": 2,
      "mode": 0,
@ -160,14 +99,22 @@
    {
      "id": 34,
      "type": "CogVideoSampler",
-      "pos": [
-        1041,
-        342
-      ],
-      "size": [
-        315.84047081854465,
-        358
-      ],
+      "pos": {
+        "0": 1041,
+        "1": 342,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
+      "size": {
+        "0": 315.8404846191406,
+        "1": 358
+      },
      "flags": {},
      "order": 4,
      "mode": 0,
@ -231,13 +178,21 @@
    {
      "id": 33,
      "type": "VHS_VideoCombine",
-      "pos": [
-        1441,
-        129
-      ],
+      "pos": {
+        "0": 1441,
+        "1": 129,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
      "size": [
        778.7022705078125,
-        853.801513671875
+        310
      ],
      "flags": {},
      "order": 6,
@ -281,8 +236,7 @@
        "filename_prefix": "CogVideoX5B",
        "format": "video/h264-mp4",
        "pix_fmt": "yuv420p",
-        "bitrate": 10,
-        "megabit": true,
+        "crf": 19,
        "save_metadata": true,
        "pingpong": false,
        "save_output": false,
@ -303,17 +257,26 @@
    {
      "id": 20,
      "type": "CLIPLoader",
-      "pos": [
-        -26,
-        400
-      ],
+      "pos": {
+        "0": -26,
+        "1": 400,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
      "size": {
        "0": 451.30548095703125,
        "1": 82
      },
      "flags": {},
-      "order": 1,
+      "order": 0,
      "mode": 0,
+      "inputs": [],
      "outputs": [
        {
          "name": "CLIP",
@ -333,6 +296,108 @@
        "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
        "sd3"
      ]
+    },
+    {
+      "id": 1,
+      "type": "DownloadAndLoadCogVideoModel",
+      "pos": {
+        "0": 642,
+        "1": 90,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
+      "size": {
+        "0": 315,
+        "1": 154
+      },
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "cogvideo_pipe",
+          "type": "COGVIDEOPIPE",
+          "links": [
+            60
+          ],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "DownloadAndLoadCogVideoModel"
+      },
+      "widgets_values": [
+        "THUDM/CogVideoX-5b",
+        "bf16",
+        "disabled",
+        "disabled",
+        false
+      ]
+    },
+    {
+      "id": 11,
+      "type": "CogVideoDecode",
+      "pos": {
+        "0": 1051,
+        "1": 748,
+        "2": 0,
+        "3": 0,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0,
+        "8": 0,
+        "9": 0
+      },
+      "size": [
+        300.3964783563508,
+        198
+      ],
+      "flags": {},
+      "order": 5,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "pipeline",
+          "type": "COGVIDEOPIPE",
+          "link": 63
+        },
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "link": 64
+        }
+      ],
+      "outputs": [
+        {
+          "name": "images",
+          "type": "IMAGE",
+          "links": [
+            59
+          ],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoDecode"
+      },
+      "widgets_values": [
+        false,
+        96,
+        96,
+        0.083,
+        0.083,
+        true
+      ]
    }
  ],
  "links": [
@ -405,10 +470,10 @@
  "config": {},
  "extra": {
    "ds": {
-      "scale": 0.7513148009015777,
+      "scale": 0.8264462809917354,
      "offset": [
-        209.1392882550122,
-        105.74671444060245
+        161.910286780368,
+        124.7586178095323
      ]
    }
  },
--- a/nodes.py
+++ b/nodes.py
@ -48,10 +48,6 @@ class DownloadAndLoadCogVideoModel:
        mm.soft_empty_cache()

        dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]
-        if fp8_transformer != "disabled":
-            transformer_dtype = torch.float8_e4m3fn
-        else:
-            transformer_dtype = dtype

        if "2b" in model:
            base_path = os.path.join(folder_paths.models_dir, "CogVideo", "CogVideo2B")
@ -68,12 +64,15 @@ class DownloadAndLoadCogVideoModel:
                local_dir=base_path,
                local_dir_use_symlinks=False,
            )
-        transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder="transformer").to(transformer_dtype).to(offload_device)
-        if fp8_transformer == "fastmode":
-            from .fp8_optimization import convert_fp8_linear
-            convert_fp8_linear(transformer, dtype)
+        if fp8_transformer == "enabled" or fp8_transformer == "fastmode":
+            transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder="transformer").to(torch.float8_e4m3fn).to(offload_device)
+            if fp8_transformer == "fastmode":
+                from .fp8_optimization import convert_fp8_linear
+                convert_fp8_linear(transformer, dtype)
+        else:
+            transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder="transformer").to(dtype).to(offload_device)
+
        vae = AutoencoderKLCogVideoX.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device)
-        
        scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder="scheduler")

        pipe = CogVideoXPipeline(vae, transformer, scheduler)
@ -95,8 +94,6 @@ class DownloadAndLoadCogVideoModel:
            fuse_qkv_projections=True,
            )

-        
-
        pipeline = {
            "pipe": pipe,
            "dtype": dtype,
@ -215,6 +212,8 @@ class CogVideoImageEncode:
            # mask = mask.unsqueeze(-1).repeat(1, 1, 1, C)
            # print(mask.shape)
            # input_image = input_image * (1 -mask)
+        else:
+            pipeline["pipe"].original_mask = None
            
        input_image = input_image * 2.0 - 1.0
        input_image = input_image.to(vae.dtype).to(device)
@ -265,7 +264,7 @@ class CogVideoSampler:
                "steps": ("INT", {"default": 50, "min": 1}),
                "cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
                "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
-                "scheduler": (["DDIM", "DPM"], {"tooltip": "5B likes DPM, but it doesn't support temporal tiling"}),
+                "scheduler": (["DDIM", "DPM", "DDIM_tiled"], {"tooltip": "5B likes DPM, but it doesn't support temporal tiling"}),
                "t_tile_length": ("INT", {"default": 16, "min": 2, "max": 128, "step": 1, "tooltip": "Length of temporal tiling, use same alue as num_frames to disable, disabled automatically for DPM"}),
                "t_tile_overlap": ("INT", {"default": 8, "min": 2, "max": 128, "step": 1, "tooltip": "Overlap of temporal tiling"}),
            },
@ -298,7 +297,7 @@ class CogVideoSampler:
            pipe.transformer.to(device)
        generator = torch.Generator(device=device).manual_seed(seed)

-        if scheduler == "DDIM":
+        if scheduler == "DDIM" or scheduler == "DDIM_tiled":
            pipe.scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder="scheduler")
        elif scheduler == "DPM":
            pipe.scheduler = CogVideoXDPMScheduler.from_pretrained(base_path, subfolder="scheduler")
@ -324,7 +323,8 @@ class CogVideoSampler:
                prompt_embeds=positive.to(dtype).to(device),
                negative_prompt_embeds=negative.to(dtype).to(device),
                generator=generator,
-                device=device
+                device=device,
+                scheduler_name=scheduler
            )
        if not pipeline["cpu_offloading"]:
            pipe.transformer.to(offload_device)
--- a/pipeline_cogvideox.py
+++ b/pipeline_cogvideox.py
@ -332,10 +332,11 @@ class CogVideoXPipeline(DiffusionPipeline):
        num_videos_per_prompt: int = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
        device = torch.device("cuda"),
+        scheduler_name: str = "DPM",
    ):
        """
        Function invoked when calling the pipeline for generation.
@ -421,8 +422,11 @@ class CogVideoXPipeline(DiffusionPipeline):

        if latents is None and num_frames == t_tile_length:
            num_frames += 1
-        image_latents = latents
-        original_image_latents = image_latents
+
+        if self.original_mask is not None:
+            image_latents = latents
+            original_image_latents = image_latents
+
        latents, timesteps, noise = self.prepare_latents(
            batch_size * num_videos_per_prompt,
            latent_channels,
@ -439,15 +443,9 @@ class CogVideoXPipeline(DiffusionPipeline):
        )
        latents = latents.to(self.transformer.dtype)
       
-
        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

-        t_tile_weights = self._gaussian_weights(t_tile_length=t_tile_length, t_batch_size=1).to(latents.device).to(latents.dtype)
-        print("latents.shape", latents.shape)
-        print("latents.device", latents.device)
-
-
        # 6.5. Create rotary embeds if required
        image_rotary_emb = (
            self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
@ -471,15 +469,23 @@ class CogVideoXPipeline(DiffusionPipeline):
        # 7. Denoising loop
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
        comfy_pbar = ProgressBar(num_inference_steps)
+
+        # 8. Temporal tiling prep
+        if "tiled" in scheduler_name:
+            t_tile_weights = self._gaussian_weights(t_tile_length=t_tile_length, t_batch_size=1).to(latents.device).to(self.vae.dtype)
+            temporal_tiling = True
+            print("Temporal tiling enabled")
+        else:
+            temporal_tiling = False
+            print("Temporal tiling disabled")
+        print("latents.shape", latents.shape)
        
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            
-            # for DPM-solver++
-            old_pred_original_sample = None
+        with self.progress_bar(total=num_inference_steps) as progress_bar:    
+            old_pred_original_sample = None # for DPM-solver++
            for i, t in enumerate(timesteps):
                if self.interrupt:
                    continue
-                if not isinstance(self.scheduler, CogVideoXDPMScheduler):
+                if temporal_tiling and isinstance(self.scheduler, CogVideoXDDIMScheduler):
                    #temporal tiling code based on https://github.com/mayuelala/FollowYourEmoji/blob/main/models/video_pipeline.py
                    # =====================================================
                    grid_ts = 0
@ -532,12 +538,12 @@ class CogVideoXPipeline(DiffusionPipeline):
                            noise_pred = noise_pred_uncond + self._guidance_scale * (noise_pred_text - noise_pred_uncond)

                        # compute the previous noisy sample x_t -> x_t-1
-                        latents_tile = self.scheduler.step(noise_pred, t, latents_tile, **extra_step_kwargs, return_dict=False)[0]            
+                        latents_tile = self.scheduler.step(noise_pred, t, latents_tile.to(self.vae.dtype), **extra_step_kwargs, return_dict=False)[0]            
                        latents_all_list.append(latents_tile)

                    # ==========================================
-                    latents_all = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype)
-                    contributors = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype)
+                    latents_all = torch.zeros(latents.shape, device=latents.device, dtype=self.vae.dtype)
+                    contributors = torch.zeros(latents.shape, device=latents.device, dtype=self.vae.dtype)
                    # Add each tile contribution to overall latents
                    for t_i in range(grid_ts):
                        if t_i < grid_ts - 1:
@ -573,7 +579,6 @@ class CogVideoXPipeline(DiffusionPipeline):
                        comfy_pbar.update(1)
                    # ==========================================
                else:
-                    
                    latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

@ -590,25 +595,28 @@ class CogVideoXPipeline(DiffusionPipeline):
                    )[0]
                    noise_pred = noise_pred.float()

-                   
-                    self._guidance_scale = 1 + guidance_scale * (
-                        (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
-                    )
+                    if isinstance(self.scheduler, CogVideoXDPMScheduler):
+                        self._guidance_scale = 1 + guidance_scale * (
+                            (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
+                        )
                    
                    if do_classifier_free_guidance:
                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                        noise_pred = noise_pred_uncond + self._guidance_scale * (noise_pred_text - noise_pred_uncond)

                    # compute the previous noisy sample x_t -> x_t-1
-                    latents, old_pred_original_sample = self.scheduler.step(
-                        noise_pred,
-                        old_pred_original_sample,
-                        t,
-                        timesteps[i - 1] if i > 0 else None,
-                        latents.to(self.vae.dtype),
-                        **extra_step_kwargs,
-                        return_dict=False,
-                    )
+                    if not isinstance(self.scheduler, CogVideoXDPMScheduler):
+                        latents = self.scheduler.step(noise_pred, t, latents.to(self.vae.dtype), **extra_step_kwargs, return_dict=False)[0]
+                    else:
+                        latents, old_pred_original_sample = self.scheduler.step(
+                            noise_pred,
+                            old_pred_original_sample,
+                            t,
+                            timesteps[i - 1] if i > 0 else None,
+                            latents.to(self.vae.dtype),
+                            **extra_step_kwargs,
+                            return_dict=False,
+                        )
                    # start diff diff
                    if i < len(timesteps) - 1 and self.original_mask is not None:
                        noise_timestep = timesteps[i + 1]