Refactor Fun sampler to be easier to use with Tora (breaks old workflows!)

The FunSampler node in old workflows needs to be recreated. I moved the forced bucket resize to its own node for anyone who still wants to use it.
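
In practice the sampler's inputs change like this (a rough sketch based on the nodes.py diff below, not an exhaustive list):

# Old CogVideoXFunSampler: the node picked the closest training bucket
# from base_resolution and resized internally.
#   video_length, base_resolution, seed, steps, cfg, scheduler
# New CogVideoXFunSampler: takes explicit dimensions instead, plus new
# optional context_options, tora_trajectory, fastercache and vid2vid_images inputs.
#   video_length, width, height, seed, steps, cfg, scheduler
# To keep the old forced-resize behavior, run the start image through the
# new CogVideoXFunResizeToClosestBucket node and wire its width/height
# outputs into the sampler.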
kijai 2024-11-07 13:01:34 +02:00
parent 666f7832f9
commit 9202921920
4 changed files with 2036 additions and 611 deletions

File diff suppressed because one or more lines are too long


@ -1,6 +1,6 @@
{
"last_node_id": 48,
"last_link_id": 101,
"last_node_id": 51,
"last_link_id": 114,
"nodes": [
{
"id": 20,
@ -22,8 +22,7 @@
"name": "CLIP",
"type": "CLIP",
"links": [
54,
56
54
],
"slot_index": 0,
"shape": 3
@ -46,16 +45,16 @@
},
"size": {
"0": 463.01251220703125,
"1": 124
"1": 144
},
"flags": {},
"order": 4,
"order": 5,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 56
"link": 108
}
],
"outputs": [
@ -63,10 +62,15 @@
"name": "conditioning",
"type": "CONDITIONING",
"links": [
86
111
],
"slot_index": 0,
"shape": 3
},
{
"name": "clip",
"type": "CLIP",
"links": null
}
],
"properties": {
@ -87,7 +91,7 @@
},
"size": [
855.81494140625,
927.6441243489584
881.2099609375
],
"flags": {},
"order": 8,
@ -101,17 +105,20 @@
{
"name": "audio",
"type": "AUDIO",
"link": null
"link": null,
"shape": 7
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
"link": null,
"shape": 7
},
{
"name": "vae",
"type": "VAE",
"link": null
"link": null,
"shape": 7
}
],
"outputs": [
@ -139,7 +146,7 @@
"hidden": false,
"paused": false,
"params": {
"filename": "CogVideoX_Fun_00012.mp4",
"filename": "CogVideoX_Fun_00003.mp4",
"subfolder": "",
"type": "temp",
"format": "video/h264-mp4",
@ -149,61 +156,12 @@
}
}
},
{
"id": 11,
"type": "CogVideoDecode",
"pos": {
"0": 1448,
"1": 345
},
"size": {
"0": 300.396484375,
"1": 198
},
"flags": {},
"order": 7,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 89
},
{
"name": "samples",
"type": "LATENT",
"link": 88
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
97
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
},
"widgets_values": [
true,
240,
360,
0.2,
0.2,
true
]
},
{
"id": 36,
"type": "LoadImage",
"pos": {
"0": 364,
"1": 715
"0": 227,
"1": 700
},
"size": {
"0": 391.3421325683594,
@ -242,15 +200,15 @@
"id": 37,
"type": "ImageResizeKJ",
"pos": {
"0": 824,
"1": 715
"0": 688,
"1": 708
},
"size": {
"0": 315,
"1": 266
},
"flags": {},
"order": 5,
"order": 4,
"mode": 0,
"inputs": [
{
@ -261,7 +219,8 @@
{
"name": "get_image_size",
"type": "IMAGE",
"link": null
"link": null,
"shape": 7
},
{
"name": "width_input",
@ -285,7 +244,7 @@
"name": "IMAGE",
"type": "IMAGE",
"links": [
87
112
],
"slot_index": 0,
"shape": 3
@ -317,6 +276,55 @@
"disabled"
]
},
{
"id": 11,
"type": "CogVideoDecode",
"pos": {
"0": 1477,
"1": 344
},
"size": {
"0": 300.396484375,
"1": 198
},
"flags": {},
"order": 7,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 113
},
{
"name": "samples",
"type": "LATENT",
"link": 114
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
97
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
},
"widgets_values": [
true,
240,
360,
0.2,
0.2,
true
]
},
{
"id": 30,
"type": "CogVideoTextEncode",
@ -343,10 +351,18 @@
"name": "conditioning",
"type": "CONDITIONING",
"links": [
85
110
],
"slot_index": 0,
"shape": 3
},
{
"name": "clip",
"type": "CLIP",
"links": [
108
],
"slot_index": 1
}
],
"properties": {
@ -355,55 +371,19 @@
"widgets_values": [
"majestic stag grazing in a forest and basking in the setting sun",
1,
true
false
]
},
{
"id": 48,
"type": "DownloadAndLoadCogVideoGGUFModel",
"pos": {
"0": 584,
"1": 103
},
"size": {
"0": 378,
"1": 130
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
101
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoGGUFModel"
},
"widgets_values": [
"CogVideoX_5b_fun_GGUF_Q4_0.safetensors",
"bf16",
false,
"offload_device"
]
},
{
"id": 41,
"id": 51,
"type": "CogVideoXFunSampler",
"pos": {
"0": 1058,
"1": 345
},
"size": {
"0": 315,
"1": 302
"0": 367.79998779296875,
"1": 434
},
"flags": {},
"order": 6,
@ -412,32 +392,53 @@
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 101
"link": 109
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 85
"link": 110
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 86
"link": 111
},
{
"name": "start_img",
"type": "IMAGE",
"link": 87
"link": 112,
"shape": 7
},
{
"name": "end_img",
"type": "IMAGE",
"link": null
"link": null,
"shape": 7
},
{
"name": "opt_empty_latent",
"type": "LATENT",
"link": null
"name": "context_options",
"type": "COGCONTEXT",
"link": null,
"shape": 7
},
{
"name": "tora_trajectory",
"type": "TORAFEATURES",
"link": null,
"shape": 7
},
{
"name": "fastercache",
"type": "FASTERCACHEARGS",
"link": null,
"shape": 7
},
{
"name": "vid2vid_images",
"type": "IMAGE",
"link": null,
"shape": 7
}
],
"outputs": [
@ -445,18 +446,15 @@
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
89
],
"slot_index": 0,
"shape": 3
113
]
},
{
"name": "samples",
"type": "LATENT",
"links": [
88
],
"shape": 3
114
]
}
],
"properties": {
@ -464,12 +462,66 @@
},
"widgets_values": [
49,
512,
44,
"fixed",
30,
720,
480,
43,
"randomize",
50,
6,
"CogVideoXDPMScheduler"
"DDIM",
0.0563,
1
]
},
{
"id": 48,
"type": "DownloadAndLoadCogVideoGGUFModel",
"pos": {
"0": 585,
"1": 34
},
"size": {
"0": 378,
"1": 198
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "pab_config",
"type": "PAB_CONFIG",
"link": null,
"shape": 7
},
{
"name": "block_edit",
"type": "TRANSFORMERBLOCKS",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
109
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoGGUFModel"
},
"widgets_values": [
"CogVideoX_5b_fun_1_1_GGUF_Q4_0.safetensors",
"bf16",
false,
"offload_device",
false,
"disabled"
]
}
],
@ -482,14 +534,6 @@
0,
"CLIP"
],
[
56,
20,
0,
31,
0,
"CLIP"
],
[
71,
36,
@ -498,46 +542,6 @@
0,
"IMAGE"
],
[
85,
30,
0,
41,
1,
"CONDITIONING"
],
[
86,
31,
0,
41,
2,
"CONDITIONING"
],
[
87,
37,
0,
41,
3,
"IMAGE"
],
[
88,
41,
1,
11,
1,
"LATENT"
],
[
89,
41,
0,
11,
0,
"COGVIDEOPIPE"
],
[
97,
11,
@ -547,22 +551,70 @@
"IMAGE"
],
[
101,
108,
30,
1,
31,
0,
"CLIP"
],
[
109,
48,
0,
41,
51,
0,
"COGVIDEOPIPE"
],
[
110,
30,
0,
51,
1,
"CONDITIONING"
],
[
111,
31,
0,
51,
2,
"CONDITIONING"
],
[
112,
37,
0,
51,
3,
"IMAGE"
],
[
113,
51,
0,
11,
0,
"COGVIDEOPIPE"
],
[
114,
51,
1,
11,
1,
"LATENT"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.7627768444385654,
"scale": 0.7513148009015784,
"offset": [
62.58315607223924,
102.05205752424705
724.7448506313632,
128.336592104936
]
}
},


@ -1,6 +1,6 @@
{
"last_node_id": 45,
"last_link_id": 97,
"last_node_id": 47,
"last_link_id": 110,
"nodes": [
{
"id": 20,
@ -22,8 +22,7 @@
"name": "CLIP",
"type": "CLIP",
"links": [
54,
56
54
],
"slot_index": 0,
"shape": 3
@ -37,85 +36,6 @@
"sd3"
]
},
{
"id": 37,
"type": "ImageResizeKJ",
"pos": {
"0": 824,
"1": 715
},
"size": {
"0": 315,
"1": 266
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 71
},
{
"name": "get_image_size",
"type": "IMAGE",
"link": null
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
87
],
"slot_index": 0,
"shape": 3
},
{
"name": "width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "height",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [
720,
480,
"nearest-exact",
false,
2,
0,
0,
"disabled"
]
},
{
"id": 11,
"type": "CogVideoDecode",
@ -134,12 +54,12 @@
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 89
"link": 108
},
{
"name": "samples",
"type": "LATENT",
"link": 88
"link": 109
}
],
"outputs": [
@ -165,43 +85,6 @@
true
]
},
{
"id": 1,
"type": "DownloadAndLoadCogVideoModel",
"pos": {
"0": 642,
"1": 90
},
"size": {
"0": 337.8885192871094,
"1": 154
},
"flags": {},
"order": 1,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
84
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoModel"
},
"widgets_values": [
"kijai/CogVideoX-Fun-5b",
"bf16",
"disabled",
"disabled",
false
]
},
{
"id": 31,
"type": "CogVideoTextEncode",
@ -211,16 +94,16 @@
},
"size": {
"0": 463.01251220703125,
"1": 98.10446166992188
"1": 144
},
"flags": {},
"order": 4,
"order": 5,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 56
"link": 110
}
],
"outputs": [
@ -228,17 +111,24 @@
"name": "conditioning",
"type": "CONDITIONING",
"links": [
86
106
],
"slot_index": 0,
"shape": 3
},
{
"name": "clip",
"type": "CLIP",
"links": null
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. "
"The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. ",
1,
true
]
},
{
@ -249,8 +139,8 @@
"1": 345
},
"size": [
605.3909898931465,
724.5306772953109
605.3909912109375,
714.2606608072917
],
"flags": {},
"order": 8,
@ -264,17 +154,20 @@
{
"name": "audio",
"type": "AUDIO",
"link": null
"link": null,
"shape": 7
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
"link": null,
"shape": 7
},
{
"name": "vae",
"type": "VAE",
"link": null
"link": null,
"shape": 7
}
],
"outputs": [
@ -302,7 +195,7 @@
"hidden": false,
"paused": false,
"params": {
"filename": "CogVideoX_Fun_00003.mp4",
"filename": "CogVideoX_Fun_00001.mp4",
"subfolder": "",
"type": "temp",
"format": "video/h264-mp4",
@ -313,15 +206,191 @@
}
},
{
"id": 41,
"type": "CogVideoXFunSampler",
"id": 36,
"type": "LoadImage",
"pos": {
"0": 1058,
"1": 345
"0": 325,
"1": 715
},
"size": {
"0": 432.4361877441406,
"1": 361.0254211425781
},
"flags": {},
"order": 1,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
71
],
"slot_index": 0,
"shape": 3
},
{
"name": "MASK",
"type": "MASK",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"6e1a7befce6daa63fc01cb66c1a22ed0.jpg",
"image"
]
},
{
"id": 1,
"type": "DownloadAndLoadCogVideoModel",
"pos": {
"0": 602,
"1": 53
},
"size": {
"0": 337.8885192871094,
"1": 194
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "pab_config",
"type": "PAB_CONFIG",
"link": null,
"shape": 7
},
{
"name": "block_edit",
"type": "TRANSFORMERBLOCKS",
"link": null,
"shape": 7
},
{
"name": "lora",
"type": "COGLORA",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
104
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoModel"
},
"widgets_values": [
"kijai/CogVideoX-Fun-5b",
"bf16",
"disabled",
"disabled",
false
]
},
{
"id": 37,
"type": "ImageResizeKJ",
"pos": {
"0": 824,
"1": 715
},
"size": {
"0": 315,
"1": 282
"1": 266
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 71
},
{
"name": "get_image_size",
"type": "IMAGE",
"link": null,
"shape": 7
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
107
],
"slot_index": 0,
"shape": 3
},
{
"name": "width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "height",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [
720,
480,
"lanczos",
false,
2,
0,
0,
"disabled"
]
},
{
"id": 47,
"type": "CogVideoXFunSampler",
"pos": {
"0": 1068,
"1": 198
},
"size": {
"0": 367.79998779296875,
"1": 434
},
"flags": {},
"order": 6,
@ -330,27 +399,53 @@
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 84
"link": 104
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 85
"link": 105
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 86
"link": 106
},
{
"name": "start_img",
"type": "IMAGE",
"link": 87
"link": 107,
"shape": 7
},
{
"name": "end_img",
"type": "IMAGE",
"link": null
"link": null,
"shape": 7
},
{
"name": "context_options",
"type": "COGCONTEXT",
"link": null,
"shape": 7
},
{
"name": "tora_trajectory",
"type": "TORAFEATURES",
"link": null,
"shape": 7
},
{
"name": "fastercache",
"type": "FASTERCACHEARGS",
"link": null,
"shape": 7
},
{
"name": "vid2vid_images",
"type": "IMAGE",
"link": null,
"shape": 7
}
],
"outputs": [
@ -358,18 +453,15 @@
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
89
],
"slot_index": 0,
"shape": 3
108
]
},
{
"name": "samples",
"type": "LATENT",
"links": [
88
],
"shape": 3
109
]
}
],
"properties": {
@ -377,12 +469,15 @@
},
"widgets_values": [
49,
512,
720,
480,
43,
"fixed",
30,
50,
6,
"DPM++"
"DDIM",
0.0563,
1
]
},
{
@ -411,57 +506,27 @@
"name": "conditioning",
"type": "CONDITIONING",
"links": [
85
105
],
"slot_index": 0,
"shape": 3
},
{
"name": "clip",
"type": "CLIP",
"links": [
110
],
"slot_index": 1
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"fireworks display over night city. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic."
]
},
{
"id": 36,
"type": "LoadImage",
"pos": {
"0": 325,
"1": 715
},
"size": {
"0": 432.4361877441406,
"1": 361.0254211425781
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
71
],
"slot_index": 0,
"shape": 3
},
{
"name": "MASK",
"type": "MASK",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"6e1a7befce6daa63fc01cb66c1a22ed0.jpg",
"image"
"fireworks display over night city. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
1,
false
]
}
],
@ -474,14 +539,6 @@
0,
"CLIP"
],
[
56,
20,
0,
31,
0,
"CLIP"
],
[
71,
36,
@ -490,54 +547,6 @@
0,
"IMAGE"
],
[
84,
1,
0,
41,
0,
"COGVIDEOPIPE"
],
[
85,
30,
0,
41,
1,
"CONDITIONING"
],
[
86,
31,
0,
41,
2,
"CONDITIONING"
],
[
87,
37,
0,
41,
3,
"IMAGE"
],
[
88,
41,
1,
11,
1,
"LATENT"
],
[
89,
41,
0,
11,
0,
"COGVIDEOPIPE"
],
[
97,
11,
@ -545,16 +554,72 @@
44,
0,
"IMAGE"
],
[
104,
1,
0,
47,
0,
"COGVIDEOPIPE"
],
[
105,
30,
0,
47,
1,
"CONDITIONING"
],
[
106,
31,
0,
47,
2,
"CONDITIONING"
],
[
107,
37,
0,
47,
3,
"IMAGE"
],
[
108,
47,
0,
11,
0,
"COGVIDEOPIPE"
],
[
109,
47,
1,
11,
1,
"LATENT"
],
[
110,
30,
1,
31,
0,
"CLIP"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.8264462809917361,
"scale": 0.8264462809917363,
"offset": [
97.64239267521098,
39.894747674006986
245.90746806300405,
108.93624646284617
]
}
},

nodes.py (341 lines changed)

@ -101,7 +101,33 @@ class CogVideoPABConfig:
        return (pab_config, )

class CogVideoContextOptions:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "context_schedule": (["uniform_standard", "uniform_looped", "static_standard", "temporal_tiling"],),
            "context_frames": ("INT", {"default": 48, "min": 2, "max": 100, "step": 1, "tooltip": "Number of pixel frames in the context, NOTE: the latent space has 4 frames in 1"} ),
            "context_stride": ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context stride as pixel frames, NOTE: the latent space has 4 frames in 1"} ),
            "context_overlap": ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context overlap as pixel frames, NOTE: the latent space has 4 frames in 1"} ),
            "freenoise": ("BOOLEAN", {"default": True, "tooltip": "Shuffle the noise"}),
            }
        }

    RETURN_TYPES = ("COGCONTEXT", )
    RETURN_NAMES = ("context_options",)
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"

    def process(self, context_schedule, context_frames, context_stride, context_overlap, freenoise):
        context_options = {
            "context_schedule": context_schedule,
            "context_frames": context_frames,
            "context_stride": context_stride,
            "context_overlap": context_overlap,
            "freenoise": freenoise
        }
        return (context_options,)
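
A usage sketch, for illustration (not part of the diff): converting the pixel-frame defaults above to latent frames under the 4-to-1 temporal compression the tooltips mention.

opts = CogVideoContextOptions().process(
    context_schedule="uniform_standard",
    context_frames=48,   # 48 pixel frames -> 48 / 4 = 12 latent frames per window
    context_stride=4,    # 4 pixel frames -> 1 latent frame
    context_overlap=4,   # 4 pixel frames -> 1 latent frame of overlap
    freenoise=True,
)[0]                     # the node returns a plain dict of these settings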
class CogVideoTransformerEdit:
    @classmethod
@ -155,7 +181,8 @@ class CogVideoLoraSelect:
        cog_loras_list.append(cog_lora)
        print(cog_loras_list)
        return (cog_loras_list,)

#region TextEncode
class CogVideoEncodePrompt:
    @classmethod
    def INPUT_TYPES(s):
@ -257,8 +284,8 @@ class CogVideoTextEncode:
            }
        }

    RETURN_TYPES = ("CONDITIONING",)
    RETURN_NAMES = ("conditioning",)
    RETURN_TYPES = ("CONDITIONING", "CLIP",)
    RETURN_NAMES = ("conditioning", "clip")
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"
@ -279,7 +306,7 @@ class CogVideoTextEncode:
        if force_offload:
            clip.cond_stage_model.to(offload_device)

        return (embeds, )
        return (embeds, clip, )
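
This CLIP passthrough is what lets the updated example workflows chain the negative encoder off the positive one instead of wiring the loader's CLIP output twice. A rough sketch of the pattern (encode_pos and encode_neg stand for two CogVideoTextEncode instances; the argument order beyond clip, prompt and force_offload is an assumption):

positive, clip = encode_pos.process(clip, "majestic stag grazing in a forest", 1.0, force_offload=False)
negative, _ = encode_neg.process(clip, "low quality, watermark", 1.0, force_offload=True)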
class CogVideoTextEncodeCombine:
    @classmethod
@ -311,7 +338,8 @@ class CogVideoTextEncodeCombine:
            raise ValueError("Invalid combination mode")
        return (embeds, )

#region ImageEncode
class CogVideoImageEncode:
    @classmethod
    def INPUT_TYPES(s):
@ -473,7 +501,8 @@ class CogVideoImageInterpolationEncode:
        vae.to(offload_device)
        return ({"samples": final_latents}, )

#region Tora
from .tora.traj_utils import process_traj, scale_traj_list_to_256
from torchvision.utils import flow_to_image
@ -630,8 +659,94 @@ class ToraEncodeOpticalFlow:
        }
        return (tora, )
def add_noise_to_reference_video(image, ratio=None):
    if ratio is None:
        sigma = torch.normal(mean=-3.0, std=0.5, size=(image.shape[0],)).to(image.device)
        sigma = torch.exp(sigma).to(image.dtype)
    else:
        sigma = torch.ones((image.shape[0],)).to(image.device, image.dtype) * ratio

    image_noise = torch.randn_like(image) * sigma[:, None, None, None, None]
    image_noise = torch.where(image == -1, torch.zeros_like(image), image_noise)
    image = image + image_noise
    return image
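
A quick sanity check of the helper, for illustration (not part of the diff); the caller below feeds it 5D b,c,f,h,w tensors in [-1, 1]:

import torch

video = torch.rand(1, 3, 9, 60, 90) * 2 - 1                # tiny stand-in: B,C,F,H,W in [-1, 1]
noisy = add_noise_to_reference_video(video, ratio=0.0563)  # 0.0563 is the default noise_aug_strength
assert noisy.shape == video.shape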
class CogVideoControlImageEncode:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "pipeline": ("COGVIDEOPIPE",),
            "control_video": ("IMAGE", ),
            "base_resolution": ("INT", {"min": 64, "max": 1280, "step": 64, "default": 512, "tooltip": "Base resolution, closest training data bucket resolution is chosen based on the selection."}),
            "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
            "noise_aug_strength": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}),
            },
        }

    RETURN_TYPES = ("COGCONTROL_LATENTS", "INT", "INT",)
    RETURN_NAMES = ("control_latents", "width", "height")
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    def encode(self, pipeline, control_video, base_resolution, enable_tiling, noise_aug_strength=0.0563):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()

        B, H, W, C = control_video.shape

        vae = pipeline["pipe"].vae
        vae.enable_slicing()
        if enable_tiling:
            from .mz_enable_vae_encode_tiling import enable_vae_encode_tiling
            enable_vae_encode_tiling(vae)

        if not pipeline["cpu_offloading"]:
            vae.to(device)

        # Count most suitable height and width
        aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
        control_video = np.array(control_video.cpu().numpy() * 255, np.uint8)
        original_width, original_height = Image.fromarray(control_video[0]).size
        closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
        height, width = [int(x / 16) * 16 for x in closest_size]
        log.info(f"Closest bucket size: {width}x{height}")

        video_length = int((B - 1) // vae.config.temporal_compression_ratio * vae.config.temporal_compression_ratio) + 1 if B != 1 else 1
        input_video, input_video_mask, clip_image = get_video_to_video_latent(control_video, video_length=video_length, sample_size=(height, width))

        control_video = pipeline["pipe"].image_processor.preprocess(rearrange(input_video, "b c f h w -> (b f) c h w"), height=height, width=width)
        control_video = control_video.to(dtype=torch.float32)
        control_video = rearrange(control_video, "(b f) c h w -> b c f h w", f=video_length)

        masked_image = control_video.to(device=device, dtype=vae.dtype)
        if noise_aug_strength > 0:
            masked_image = add_noise_to_reference_video(masked_image, ratio=noise_aug_strength)

        bs = 1
        new_mask_pixel_values = []
        for i in range(0, masked_image.shape[0], bs):
            mask_pixel_values_bs = masked_image[i : i + bs]
            mask_pixel_values_bs = vae.encode(mask_pixel_values_bs)[0]
            mask_pixel_values_bs = mask_pixel_values_bs.mode()
            new_mask_pixel_values.append(mask_pixel_values_bs)
        masked_image_latents = torch.cat(new_mask_pixel_values, dim = 0)

        masked_image_latents = masked_image_latents * vae.config.scaling_factor
        vae.to(offload_device)

        control_latents = {
            "latents": masked_image_latents,
            "num_frames" : B,
            "height" : height,
            "width" : width,
        }
        return (control_latents, width, height)
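
A worked example of the bucket selection in encode() above, for illustration (not part of the diff; it assumes ASPECT_RATIO_512 maps ratio keys to [height, width] pairs defined at a 512 base):

base_resolution = 768  # hypothetical choice; buckets scale linearly from the 512 table
buckets = {k: [x / 512 * base_resolution for x in ASPECT_RATIO_512[k]] for k in ASPECT_RATIO_512}
closest_size, closest_ratio = get_closest_ratio(480, 720, ratios=buckets)  # input height, width
height, width = [int(x / 16) * 16 for x in closest_size]                   # snap to multiples of 16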
#region FasterCache
class CogVideoXFasterCache:
    @classmethod
    def INPUT_TYPES(s):
@ -659,7 +774,8 @@ class CogVideoXFasterCache:
"cache_device" : device if cache_device == "main_device" else offload_device
}
return (fastercache,)
#region Sampler
class CogVideoSampler:
@classmethod
def INPUT_TYPES(s):
@ -782,7 +898,43 @@ class CogVideoSampler:
        mm.soft_empty_cache()
        return (pipeline, {"samples": latents})
class CogVideoControlNet:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "controlnet": ("COGVIDECONTROLNETMODEL",),
            "images": ("IMAGE", ),
            "control_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
            "control_start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            "control_end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            },
        }

    RETURN_TYPES = ("COGVIDECONTROLNET",)
    RETURN_NAMES = ("cogvideo_controlnet",)
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    def encode(self, controlnet, images, control_strength, control_start_percent, control_end_percent):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()

        B, H, W, C = images.shape

        control_frames = images.permute(0, 3, 1, 2).unsqueeze(0) * 2 - 1

        controlnet = {
            "control_model": controlnet,
            "control_frames": control_frames,
            "control_weights": control_strength,
            "control_start": control_start_percent,
            "control_end": control_end_percent,
        }
        return (controlnet,)
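
The tensor juggling in encode() above amounts to the following, for illustration (not part of the diff):

import torch

images = torch.rand(9, 60, 90, 3)  # ComfyUI IMAGE batch: B,H,W,C in [0, 1]
control_frames = images.permute(0, 3, 1, 2).unsqueeze(0) * 2 - 1
print(control_frames.shape)        # torch.Size([1, 9, 3, 60, 90]), values now in [-1, 1]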
#region VideoDecode
class CogVideoDecode:
    @classmethod
    def INPUT_TYPES(s):
@ -878,7 +1030,8 @@ class CogVideoXFunResizeToClosestBucket:
        resized_images = resized_images.movedim(1,-1)

        return (resized_images, width, height)

#region FunSamplers
class CogVideoXFunSampler:
    @classmethod
    def INPUT_TYPES(s):
@ -888,7 +1041,8 @@ class CogVideoXFunSampler:
"positive": ("CONDITIONING", ),
"negative": ("CONDITIONING", ),
"video_length": ("INT", {"default": 49, "min": 5, "max": 2048, "step": 4}),
"base_resolution": ("INT", {"min": 64, "max": 1280, "step": 64, "default": 512, "tooltip": "Base resolution, closest training data bucket resolution is chosen based on the selection."}),
"width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
"seed": ("INT", {"default": 43, "min": 0, "max": 0xffffffffffffffff}),
"steps": ("INT", {"default": 50, "min": 1, "max": 200, "step": 1}),
"cfg": ("FLOAT", {"default": 6.0, "min": 1.0, "max": 20.0, "step": 0.01}),
@ -897,7 +1051,6 @@ class CogVideoXFunSampler:
"optional":{
"start_img": ("IMAGE",),
"end_img": ("IMAGE",),
"opt_empty_latent": ("LATENT",),
"noise_aug_strength": ("FLOAT", {"default": 0.0563, "min": 0.0, "max": 1.0, "step": 0.001}),
"context_options": ("COGCONTEXT", ),
"tora_trajectory": ("TORAFEATURES", ),
@ -912,8 +1065,8 @@ class CogVideoXFunSampler:
FUNCTION = "process"
CATEGORY = "CogVideoWrapper"
def process(self, pipeline, positive, negative, video_length, base_resolution, seed, steps, cfg, scheduler,
start_img=None, end_img=None, opt_empty_latent=None, noise_aug_strength=0.0563, context_options=None, fastercache=None,
def process(self, pipeline, positive, negative, video_length, width, height, seed, steps, cfg, scheduler,
start_img=None, end_img=None, noise_aug_strength=0.0563, context_options=None, fastercache=None,
tora_trajectory=None, vid2vid_images=None, vid2vid_denoise=1.0):
device = mm.get_torch_device()
offload_device = mm.unet_offload_device()
@ -929,23 +1082,13 @@ class CogVideoXFunSampler:
        mm.soft_empty_cache()

        aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}

        #vid2vid
        if vid2vid_images is not None:
            validation_video = np.array(vid2vid_images.cpu().numpy() * 255, np.uint8)
            original_width, original_height = Image.fromarray(validation_video[0]).size
        #img2vid
        elif start_img is not None:
            start_img = [to_pil(_start_img) for _start_img in start_img] if start_img is not None else None
            end_img = [to_pil(_end_img) for _end_img in end_img] if end_img is not None else None
            # Count most suitable height and width
            original_width, original_height = start_img[0].size if type(start_img) is list else Image.open(start_img).size
        else:
            original_width = opt_empty_latent["samples"][0].shape[-1] * 8
            original_height = opt_empty_latent["samples"][0].shape[-2] * 8
        closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
        height, width = [int(x / 16) * 16 for x in closest_size]
        log.info(f"Closest bucket size: {width}x{height}")

        end_img = [to_pil(_end_img) for _end_img in end_img] if end_img is not None else None

        # Load Sampler
        if context_options is not None and context_options["context_schedule"] == "temporal_tiling":
@ -1045,156 +1188,6 @@ class CogVideoXFunVid2VidSampler:
    DEPRECATED = True

    def process(self):
        return ()
def add_noise_to_reference_video(image, ratio=None):
    if ratio is None:
        sigma = torch.normal(mean=-3.0, std=0.5, size=(image.shape[0],)).to(image.device)
        sigma = torch.exp(sigma).to(image.dtype)
    else:
        sigma = torch.ones((image.shape[0],)).to(image.device, image.dtype) * ratio

    image_noise = torch.randn_like(image) * sigma[:, None, None, None, None]
    image_noise = torch.where(image == -1, torch.zeros_like(image), image_noise)
    image = image + image_noise
    return image

class CogVideoControlImageEncode:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "pipeline": ("COGVIDEOPIPE",),
            "control_video": ("IMAGE", ),
            "base_resolution": ("INT", {"min": 64, "max": 1280, "step": 64, "default": 512, "tooltip": "Base resolution, closest training data bucket resolution is chosen based on the selection."}),
            "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
            "noise_aug_strength": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}),
            },
        }

    RETURN_TYPES = ("COGCONTROL_LATENTS", "INT", "INT",)
    RETURN_NAMES = ("control_latents", "width", "height")
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    def encode(self, pipeline, control_video, base_resolution, enable_tiling, noise_aug_strength=0.0563):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()

        B, H, W, C = control_video.shape

        vae = pipeline["pipe"].vae
        vae.enable_slicing()
        if enable_tiling:
            from .mz_enable_vae_encode_tiling import enable_vae_encode_tiling
            enable_vae_encode_tiling(vae)

        if not pipeline["cpu_offloading"]:
            vae.to(device)

        # Count most suitable height and width
        aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
        control_video = np.array(control_video.cpu().numpy() * 255, np.uint8)
        original_width, original_height = Image.fromarray(control_video[0]).size
        closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
        height, width = [int(x / 16) * 16 for x in closest_size]
        log.info(f"Closest bucket size: {width}x{height}")

        video_length = int((B - 1) // vae.config.temporal_compression_ratio * vae.config.temporal_compression_ratio) + 1 if B != 1 else 1
        input_video, input_video_mask, clip_image = get_video_to_video_latent(control_video, video_length=video_length, sample_size=(height, width))

        control_video = pipeline["pipe"].image_processor.preprocess(rearrange(input_video, "b c f h w -> (b f) c h w"), height=height, width=width)
        control_video = control_video.to(dtype=torch.float32)
        control_video = rearrange(control_video, "(b f) c h w -> b c f h w", f=video_length)

        masked_image = control_video.to(device=device, dtype=vae.dtype)
        if noise_aug_strength > 0:
            masked_image = add_noise_to_reference_video(masked_image, ratio=noise_aug_strength)

        bs = 1
        new_mask_pixel_values = []
        for i in range(0, masked_image.shape[0], bs):
            mask_pixel_values_bs = masked_image[i : i + bs]
            mask_pixel_values_bs = vae.encode(mask_pixel_values_bs)[0]
            mask_pixel_values_bs = mask_pixel_values_bs.mode()
            new_mask_pixel_values.append(mask_pixel_values_bs)
        masked_image_latents = torch.cat(new_mask_pixel_values, dim = 0)

        masked_image_latents = masked_image_latents * vae.config.scaling_factor
        vae.to(offload_device)

        control_latents = {
            "latents": masked_image_latents,
            "num_frames" : B,
            "height" : height,
            "width" : width,
        }
        return (control_latents, width, height)

class CogVideoControlNet:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "controlnet": ("COGVIDECONTROLNETMODEL",),
            "images": ("IMAGE", ),
            "control_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
            "control_start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            "control_end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            },
        }

    RETURN_TYPES = ("COGVIDECONTROLNET",)
    RETURN_NAMES = ("cogvideo_controlnet",)
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    def encode(self, controlnet, images, control_strength, control_start_percent, control_end_percent):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()

        B, H, W, C = images.shape

        control_frames = images.permute(0, 3, 1, 2).unsqueeze(0) * 2 - 1

        controlnet = {
            "control_model": controlnet,
            "control_frames": control_frames,
            "control_weights": control_strength,
            "control_start": control_start_percent,
            "control_end": control_end_percent,
        }
        return (controlnet,)

class CogVideoContextOptions:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "context_schedule": (["uniform_standard", "uniform_looped", "static_standard", "temporal_tiling"],),
            "context_frames": ("INT", {"default": 48, "min": 2, "max": 100, "step": 1, "tooltip": "Number of pixel frames in the context, NOTE: the latent space has 4 frames in 1"} ),
            "context_stride": ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context stride as pixel frames, NOTE: the latent space has 4 frames in 1"} ),
            "context_overlap": ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context overlap as pixel frames, NOTE: the latent space has 4 frames in 1"} ),
            "freenoise": ("BOOLEAN", {"default": True, "tooltip": "Shuffle the noise"}),
            }
        }

    RETURN_TYPES = ("COGCONTEXT", )
    RETURN_NAMES = ("context_options",)
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"

    def process(self, context_schedule, context_frames, context_stride, context_overlap, freenoise):
        context_options = {
            "context_schedule": context_schedule,
            "context_frames": context_frames,
            "context_stride": context_stride,
            "context_overlap": context_overlap,
            "freenoise": freenoise
        }
        return (context_options,)

class CogVideoXFunControlSampler:
    @classmethod