Refactor Fun sampler to be easier to use with Tora (breaks old workflows!)

The FunSampler node in old workflows needs to be remade. I moved the forced bucket resize to its own node if anyone still wants to use that.
2026-05-31 16:27:18 +08:00 · 2024-11-07 13:01:34 +02:00 · 2024-11-07 13:01:34 +02:00 · 9202921920
commit 9202921920
parent 666f7832f9
4 changed files with 2036 additions and 611 deletions
--- a/examples/cogvideox_fun_img2vid_tora_01.json
+++ b/examples/cogvideox_fun_img2vid_tora_01.json
--- a/examples/cogvidex_fun_5b_GGUF_10GB_VRAM_example_02.json
+++ b/examples/cogvidex_fun_5b_GGUF_10GB_VRAM_example_02.json
@ -1,6 +1,6 @@
 {
-  "last_node_id": 48,
+  "last_node_id": 51,
-  "last_link_id": 101,
+  "last_link_id": 114,
  "nodes": [
    {
      "id": 20,
@ -22,8 +22,7 @@
          "name": "CLIP",
          "type": "CLIP",
          "links": [
-            54,
+            54
            56
          ],
          "slot_index": 0,
          "shape": 3
@ -46,16 +45,16 @@
      },
      "size": {
        "0": 463.01251220703125,
-        "1": 124
+        "1": 144
      },
      "flags": {},
-      "order": 4,
+      "order": 5,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
-          "link": 56
+          "link": 108
        }
      ],
      "outputs": [
@ -63,10 +62,15 @@
          "name": "conditioning",
          "type": "CONDITIONING",
          "links": [
-            86
+            111
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "clip",
          "type": "CLIP",
          "links": null
        }
      ],
      "properties": {
@ -87,7 +91,7 @@
      },
      "size": [
        855.81494140625,
-        927.6441243489584
+        881.2099609375
      ],
      "flags": {},
      "order": 8,
@ -101,17 +105,20 @@
        {
          "name": "audio",
          "type": "AUDIO",
-          "link": null
+          "link": null,
          "shape": 7
        },
        {
          "name": "meta_batch",
          "type": "VHS_BatchManager",
-          "link": null
+          "link": null,
          "shape": 7
        },
        {
          "name": "vae",
          "type": "VAE",
-          "link": null
+          "link": null,
          "shape": 7
        }
      ],
      "outputs": [
@ -139,7 +146,7 @@
          "hidden": false,
          "paused": false,
          "params": {
-            "filename": "CogVideoX_Fun_00012.mp4",
+            "filename": "CogVideoX_Fun_00003.mp4",
            "subfolder": "",
            "type": "temp",
            "format": "video/h264-mp4",
@ -149,61 +156,12 @@
        }
      }
    },
    {
      "id": 11,
      "type": "CogVideoDecode",
      "pos": {
        "0": 1448,
        "1": 345
      },
      "size": {
        "0": 300.396484375,
        "1": 198
      },
      "flags": {},
      "order": 7,
      "mode": 0,
      "inputs": [
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
          "link": 89
        },
        {
          "name": "samples",
          "type": "LATENT",
          "link": 88
        }
      ],
      "outputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "links": [
            97
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoDecode"
      },
      "widgets_values": [
        true,
        240,
        360,
        0.2,
        0.2,
        true
      ]
    },
    {
      "id": 36,
      "type": "LoadImage",
      "pos": {
-        "0": 364,
+        "0": 227,
-        "1": 715
+        "1": 700
      },
      "size": {
        "0": 391.3421325683594,
@ -242,15 +200,15 @@
      "id": 37,
      "type": "ImageResizeKJ",
      "pos": {
-        "0": 824,
+        "0": 688,
-        "1": 715
+        "1": 708
      },
      "size": {
        "0": 315,
        "1": 266
      },
      "flags": {},
-      "order": 5,
+      "order": 4,
      "mode": 0,
      "inputs": [
        {
@ -261,7 +219,8 @@
        {
          "name": "get_image_size",
          "type": "IMAGE",
-          "link": null
+          "link": null,
          "shape": 7
        },
        {
          "name": "width_input",
@ -285,7 +244,7 @@
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
-            87
+            112
          ],
          "slot_index": 0,
          "shape": 3
@ -317,6 +276,55 @@
        "disabled"
      ]
    },
    {
      "id": 11,
      "type": "CogVideoDecode",
      "pos": {
        "0": 1477,
        "1": 344
      },
      "size": {
        "0": 300.396484375,
        "1": 198
      },
      "flags": {},
      "order": 7,
      "mode": 0,
      "inputs": [
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
          "link": 113
        },
        {
          "name": "samples",
          "type": "LATENT",
          "link": 114
        }
      ],
      "outputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "links": [
            97
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoDecode"
      },
      "widgets_values": [
        true,
        240,
        360,
        0.2,
        0.2,
        true
      ]
    },
    {
      "id": 30,
      "type": "CogVideoTextEncode",
@ -343,10 +351,18 @@
          "name": "conditioning",
          "type": "CONDITIONING",
          "links": [
-            85
+            110
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "clip",
          "type": "CLIP",
          "links": [
            108
          ],
          "slot_index": 1
        }
      ],
      "properties": {
@ -355,55 +371,19 @@
      "widgets_values": [
        "majestic stag grazing in a forest and basking in the setting sun",
        1,
-        true
+        false
      ]
    },
    {
-      "id": 48,
+      "id": 51,
      "type": "DownloadAndLoadCogVideoGGUFModel",
      "pos": {
        "0": 584,
        "1": 103
      },
      "size": {
        "0": 378,
        "1": 130
      },
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "cogvideo_pipe",
          "type": "COGVIDEOPIPE",
          "links": [
            101
          ],
          "shape": 3,
          "slot_index": 0
        }
      ],
      "properties": {
        "Node name for S&R": "DownloadAndLoadCogVideoGGUFModel"
      },
      "widgets_values": [
        "CogVideoX_5b_fun_GGUF_Q4_0.safetensors",
        "bf16",
        false,
        "offload_device"
      ]
    },
    {
      "id": 41,
      "type": "CogVideoXFunSampler",
      "pos": {
        "0": 1058,
        "1": 345
      },
      "size": {
-        "0": 315,
+        "0": 367.79998779296875,
-        "1": 302
+        "1": 434
      },
      "flags": {},
      "order": 6,
@ -412,32 +392,53 @@
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
-          "link": 101
+          "link": 109
        },
        {
          "name": "positive",
          "type": "CONDITIONING",
-          "link": 85
+          "link": 110
        },
        {
          "name": "negative",
          "type": "CONDITIONING",
-          "link": 86
+          "link": 111
        },
        {
          "name": "start_img",
          "type": "IMAGE",
-          "link": 87
+          "link": 112,
          "shape": 7
        },
        {
          "name": "end_img",
          "type": "IMAGE",
-          "link": null
+          "link": null,
          "shape": 7
        },
        {
-          "name": "opt_empty_latent",
+          "name": "context_options",
-          "type": "LATENT",
+          "type": "COGCONTEXT",
-          "link": null
+          "link": null,
          "shape": 7
        },
        {
          "name": "tora_trajectory",
          "type": "TORAFEATURES",
          "link": null,
          "shape": 7
        },
        {
          "name": "fastercache",
          "type": "FASTERCACHEARGS",
          "link": null,
          "shape": 7
        },
        {
          "name": "vid2vid_images",
          "type": "IMAGE",
          "link": null,
          "shape": 7
        }
      ],
      "outputs": [
@ -445,18 +446,15 @@
          "name": "cogvideo_pipe",
          "type": "COGVIDEOPIPE",
          "links": [
-            89
+            113
-          ],
+          ]
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "samples",
          "type": "LATENT",
          "links": [
-            88
+            114
-          ],
+          ]
          "shape": 3
        }
      ],
      "properties": {
@ -464,12 +462,66 @@
      },
      "widgets_values": [
        49,
-        512,
+        720,
-        44,
+        480,
-        "fixed",
+        43,
-        30,
+        "randomize",
        50,
        6,
-        "CogVideoXDPMScheduler"
+        "DDIM",
        0.0563,
        1
      ]
    },
    {
      "id": 48,
      "type": "DownloadAndLoadCogVideoGGUFModel",
      "pos": {
        "0": 585,
        "1": 34
      },
      "size": {
        "0": 378,
        "1": 198
      },
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [
        {
          "name": "pab_config",
          "type": "PAB_CONFIG",
          "link": null,
          "shape": 7
        },
        {
          "name": "block_edit",
          "type": "TRANSFORMERBLOCKS",
          "link": null,
          "shape": 7
        }
      ],
      "outputs": [
        {
          "name": "cogvideo_pipe",
          "type": "COGVIDEOPIPE",
          "links": [
            109
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "DownloadAndLoadCogVideoGGUFModel"
      },
      "widgets_values": [
        "CogVideoX_5b_fun_1_1_GGUF_Q4_0.safetensors",
        "bf16",
        false,
        "offload_device",
        false,
        "disabled"
      ]
    }
  ],
@ -482,14 +534,6 @@
      0,
      "CLIP"
    ],
    [
      56,
      20,
      0,
      31,
      0,
      "CLIP"
    ],
    [
      71,
      36,
@ -498,46 +542,6 @@
      0,
      "IMAGE"
    ],
    [
      85,
      30,
      0,
      41,
      1,
      "CONDITIONING"
    ],
    [
      86,
      31,
      0,
      41,
      2,
      "CONDITIONING"
    ],
    [
      87,
      37,
      0,
      41,
      3,
      "IMAGE"
    ],
    [
      88,
      41,
      1,
      11,
      1,
      "LATENT"
    ],
    [
      89,
      41,
      0,
      11,
      0,
      "COGVIDEOPIPE"
    ],
    [
      97,
      11,
@ -547,22 +551,70 @@
      "IMAGE"
    ],
    [
-      101,
+      108,
      30,
      1,
      31,
      0,
      "CLIP"
    ],
    [
      109,
      48,
      0,
-      41,
+      51,
      0,
      "COGVIDEOPIPE"
    ],
    [
      110,
      30,
      0,
      51,
      1,
      "CONDITIONING"
    ],
    [
      111,
      31,
      0,
      51,
      2,
      "CONDITIONING"
    ],
    [
      112,
      37,
      0,
      51,
      3,
      "IMAGE"
    ],
    [
      113,
      51,
      0,
      11,
      0,
      "COGVIDEOPIPE"
    ],
    [
      114,
      51,
      1,
      11,
      1,
      "LATENT"
    ]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
-      "scale": 0.7627768444385654,
+      "scale": 0.7513148009015784,
      "offset": [
-        62.58315607223924,
+        724.7448506313632,
-        102.05205752424705
+        128.336592104936
      ]
    }
  },
--- a/examples/cogvidex_fun_i2v_example_02.json
+++ b/examples/cogvidex_fun_i2v_example_02.json
@ -1,6 +1,6 @@
 {
-  "last_node_id": 45,
+  "last_node_id": 47,
-  "last_link_id": 97,
+  "last_link_id": 110,
  "nodes": [
    {
      "id": 20,
@ -22,8 +22,7 @@
          "name": "CLIP",
          "type": "CLIP",
          "links": [
-            54,
+            54
            56
          ],
          "slot_index": 0,
          "shape": 3
@ -37,85 +36,6 @@
        "sd3"
      ]
    },
    {
      "id": 37,
      "type": "ImageResizeKJ",
      "pos": {
        "0": 824,
        "1": 715
      },
      "size": {
        "0": 315,
        "1": 266
      },
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [
        {
          "name": "image",
          "type": "IMAGE",
          "link": 71
        },
        {
          "name": "get_image_size",
          "type": "IMAGE",
          "link": null
        },
        {
          "name": "width_input",
          "type": "INT",
          "link": null,
          "widget": {
            "name": "width_input"
          }
        },
        {
          "name": "height_input",
          "type": "INT",
          "link": null,
          "widget": {
            "name": "height_input"
          }
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            87
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "width",
          "type": "INT",
          "links": null,
          "shape": 3
        },
        {
          "name": "height",
          "type": "INT",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "ImageResizeKJ"
      },
      "widgets_values": [
        720,
        480,
        "nearest-exact",
        false,
        2,
        0,
        0,
        "disabled"
      ]
    },
    {
      "id": 11,
      "type": "CogVideoDecode",
@ -134,12 +54,12 @@
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
-          "link": 89
+          "link": 108
        },
        {
          "name": "samples",
          "type": "LATENT",
-          "link": 88
+          "link": 109
        }
      ],
      "outputs": [
@ -165,43 +85,6 @@
        true
      ]
    },
    {
      "id": 1,
      "type": "DownloadAndLoadCogVideoModel",
      "pos": {
        "0": 642,
        "1": 90
      },
      "size": {
        "0": 337.8885192871094,
        "1": 154
      },
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "cogvideo_pipe",
          "type": "COGVIDEOPIPE",
          "links": [
            84
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "DownloadAndLoadCogVideoModel"
      },
      "widgets_values": [
        "kijai/CogVideoX-Fun-5b",
        "bf16",
        "disabled",
        "disabled",
        false
      ]
    },
    {
      "id": 31,
      "type": "CogVideoTextEncode",
@ -211,16 +94,16 @@
      },
      "size": {
        "0": 463.01251220703125,
-        "1": 98.10446166992188
+        "1": 144
      },
      "flags": {},
-      "order": 4,
+      "order": 5,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
-          "link": 56
+          "link": 110
        }
      ],
      "outputs": [
@ -228,17 +111,24 @@
          "name": "conditioning",
          "type": "CONDITIONING",
          "links": [
-            86
+            106
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "clip",
          "type": "CLIP",
          "links": null
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoTextEncode"
      },
      "widgets_values": [
-        "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. "
+        "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. ",
        1,
        true
      ]
    },
    {
@ -249,8 +139,8 @@
        "1": 345
      },
      "size": [
-        605.3909898931465,
+        605.3909912109375,
-        724.5306772953109
+        714.2606608072917
      ],
      "flags": {},
      "order": 8,
@ -264,17 +154,20 @@
        {
          "name": "audio",
          "type": "AUDIO",
-          "link": null
+          "link": null,
          "shape": 7
        },
        {
          "name": "meta_batch",
          "type": "VHS_BatchManager",
-          "link": null
+          "link": null,
          "shape": 7
        },
        {
          "name": "vae",
          "type": "VAE",
-          "link": null
+          "link": null,
          "shape": 7
        }
      ],
      "outputs": [
@ -302,7 +195,7 @@
          "hidden": false,
          "paused": false,
          "params": {
-            "filename": "CogVideoX_Fun_00003.mp4",
+            "filename": "CogVideoX_Fun_00001.mp4",
            "subfolder": "",
            "type": "temp",
            "format": "video/h264-mp4",
@ -313,15 +206,191 @@
      }
    },
    {
-      "id": 41,
+      "id": 36,
-      "type": "CogVideoXFunSampler",
+      "type": "LoadImage",
      "pos": {
-        "0": 1058,
+        "0": 325,
-        "1": 345
+        "1": 715
      },
      "size": {
        "0": 432.4361877441406,
        "1": 361.0254211425781
      },
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            71
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "MASK",
          "type": "MASK",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "LoadImage"
      },
      "widgets_values": [
        "6e1a7befce6daa63fc01cb66c1a22ed0.jpg",
        "image"
      ]
    },
    {
      "id": 1,
      "type": "DownloadAndLoadCogVideoModel",
      "pos": {
        "0": 602,
        "1": 53
      },
      "size": {
        "0": 337.8885192871094,
        "1": 194
      },
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [
        {
          "name": "pab_config",
          "type": "PAB_CONFIG",
          "link": null,
          "shape": 7
        },
        {
          "name": "block_edit",
          "type": "TRANSFORMERBLOCKS",
          "link": null,
          "shape": 7
        },
        {
          "name": "lora",
          "type": "COGLORA",
          "link": null,
          "shape": 7
        }
      ],
      "outputs": [
        {
          "name": "cogvideo_pipe",
          "type": "COGVIDEOPIPE",
          "links": [
            104
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "DownloadAndLoadCogVideoModel"
      },
      "widgets_values": [
        "kijai/CogVideoX-Fun-5b",
        "bf16",
        "disabled",
        "disabled",
        false
      ]
    },
    {
      "id": 37,
      "type": "ImageResizeKJ",
      "pos": {
        "0": 824,
        "1": 715
      },
      "size": {
        "0": 315,
-        "1": 282
+        "1": 266
      },
      "flags": {},
      "order": 4,
      "mode": 0,
      "inputs": [
        {
          "name": "image",
          "type": "IMAGE",
          "link": 71
        },
        {
          "name": "get_image_size",
          "type": "IMAGE",
          "link": null,
          "shape": 7
        },
        {
          "name": "width_input",
          "type": "INT",
          "link": null,
          "widget": {
            "name": "width_input"
          }
        },
        {
          "name": "height_input",
          "type": "INT",
          "link": null,
          "widget": {
            "name": "height_input"
          }
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            107
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "width",
          "type": "INT",
          "links": null,
          "shape": 3
        },
        {
          "name": "height",
          "type": "INT",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "ImageResizeKJ"
      },
      "widgets_values": [
        720,
        480,
        "lanczos",
        false,
        2,
        0,
        0,
        "disabled"
      ]
    },
    {
      "id": 47,
      "type": "CogVideoXFunSampler",
      "pos": {
        "0": 1068,
        "1": 198
      },
      "size": {
        "0": 367.79998779296875,
        "1": 434
      },
      "flags": {},
      "order": 6,
@ -330,27 +399,53 @@
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
-          "link": 84
+          "link": 104
        },
        {
          "name": "positive",
          "type": "CONDITIONING",
-          "link": 85
+          "link": 105
        },
        {
          "name": "negative",
          "type": "CONDITIONING",
-          "link": 86
+          "link": 106
        },
        {
          "name": "start_img",
          "type": "IMAGE",
-          "link": 87
+          "link": 107,
          "shape": 7
        },
        {
          "name": "end_img",
          "type": "IMAGE",
-          "link": null
+          "link": null,
          "shape": 7
        },
        {
          "name": "context_options",
          "type": "COGCONTEXT",
          "link": null,
          "shape": 7
        },
        {
          "name": "tora_trajectory",
          "type": "TORAFEATURES",
          "link": null,
          "shape": 7
        },
        {
          "name": "fastercache",
          "type": "FASTERCACHEARGS",
          "link": null,
          "shape": 7
        },
        {
          "name": "vid2vid_images",
          "type": "IMAGE",
          "link": null,
          "shape": 7
        }
      ],
      "outputs": [
@ -358,18 +453,15 @@
          "name": "cogvideo_pipe",
          "type": "COGVIDEOPIPE",
          "links": [
-            89
+            108
-          ],
+          ]
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "samples",
          "type": "LATENT",
          "links": [
-            88
+            109
-          ],
+          ]
          "shape": 3
        }
      ],
      "properties": {
@ -377,12 +469,15 @@
      },
      "widgets_values": [
        49,
-        512,
+        720,
        480,
        43,
        "fixed",
-        30,
+        50,
        6,
-        "DPM++"
+        "DDIM",
        0.0563,
        1
      ]
    },
    {
@ -411,57 +506,27 @@
          "name": "conditioning",
          "type": "CONDITIONING",
          "links": [
-            85
+            105
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "clip",
          "type": "CLIP",
          "links": [
            110
          ],
          "slot_index": 1
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoTextEncode"
      },
      "widgets_values": [
-        "fireworks display over night city. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic."
+        "fireworks display over night city. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
-      ]
+        1,
-    },
+        false
    {
      "id": 36,
      "type": "LoadImage",
      "pos": {
        "0": 325,
        "1": 715
      },
      "size": {
        "0": 432.4361877441406,
        "1": 361.0254211425781
      },
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            71
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "MASK",
          "type": "MASK",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "LoadImage"
      },
      "widgets_values": [
        "6e1a7befce6daa63fc01cb66c1a22ed0.jpg",
        "image"
      ]
    }
  ],
@ -474,14 +539,6 @@
      0,
      "CLIP"
    ],
    [
      56,
      20,
      0,
      31,
      0,
      "CLIP"
    ],
    [
      71,
      36,
@ -490,54 +547,6 @@
      0,
      "IMAGE"
    ],
    [
      84,
      1,
      0,
      41,
      0,
      "COGVIDEOPIPE"
    ],
    [
      85,
      30,
      0,
      41,
      1,
      "CONDITIONING"
    ],
    [
      86,
      31,
      0,
      41,
      2,
      "CONDITIONING"
    ],
    [
      87,
      37,
      0,
      41,
      3,
      "IMAGE"
    ],
    [
      88,
      41,
      1,
      11,
      1,
      "LATENT"
    ],
    [
      89,
      41,
      0,
      11,
      0,
      "COGVIDEOPIPE"
    ],
    [
      97,
      11,
@ -545,16 +554,72 @@
      44,
      0,
      "IMAGE"
    ],
    [
      104,
      1,
      0,
      47,
      0,
      "COGVIDEOPIPE"
    ],
    [
      105,
      30,
      0,
      47,
      1,
      "CONDITIONING"
    ],
    [
      106,
      31,
      0,
      47,
      2,
      "CONDITIONING"
    ],
    [
      107,
      37,
      0,
      47,
      3,
      "IMAGE"
    ],
    [
      108,
      47,
      0,
      11,
      0,
      "COGVIDEOPIPE"
    ],
    [
      109,
      47,
      1,
      11,
      1,
      "LATENT"
    ],
    [
      110,
      30,
      1,
      31,
      0,
      "CLIP"
    ]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
-      "scale": 0.8264462809917361,
+      "scale": 0.8264462809917363,
      "offset": [
-        97.64239267521098,
+        245.90746806300405,
-        39.894747674006986
+        108.93624646284617
      ]
    }
  },
--- a/nodes.py
+++ b/nodes.py
@ -101,7 +101,33 @@ class CogVideoPABConfig:
        return (pab_config, )
 class CogVideoContextOptions:
    """Node that bundles context-window settings into one ``COGCONTEXT`` dict.

    The node does no computation: ``process`` returns its five inputs
    unchanged, keyed by their parameter names.
    """

    RETURN_TYPES = ("COGCONTEXT", )
    RETURN_NAMES = ("context_options",)
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"

    @classmethod
    def INPUT_TYPES(s):
        """Describe the node's required inputs (schedule, frame counts, freenoise flag)."""
        schedules = ["uniform_standard", "uniform_looped", "static_standard", "temporal_tiling"]
        # Frame-count widgets are expressed in pixel frames; the tooltips
        # remind the user that one latent frame covers four of them.
        frames_spec = ("INT", {"default": 48, "min": 2, "max": 100, "step": 1, "tooltip": "Number of pixel frames in the context, NOTE: the latent space has 4 frames in 1"} )
        stride_spec = ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context stride as pixel frames, NOTE: the latent space has 4 frames in 1"} )
        overlap_spec = ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context overlap as pixel frames, NOTE: the latent space has 4 frames in 1"} )
        freenoise_spec = ("BOOLEAN", {"default": True, "tooltip": "Shuffle the noise"})
        required = {
            "context_schedule": (schedules,),
            "context_frames": frames_spec,
            "context_stride": stride_spec,
            "context_overlap": overlap_spec,
            "freenoise": freenoise_spec,
        }
        return {"required": required}

    def process(self, context_schedule, context_frames, context_stride, context_overlap, freenoise):
        """Return a 1-tuple holding the settings dict consumed as ``context_options``."""
        settings = dict(
            context_schedule=context_schedule,
            context_frames=context_frames,
            context_stride=context_stride,
            context_overlap=context_overlap,
            freenoise=freenoise,
        )
        return (settings,)
 class CogVideoTransformerEdit:
    @classmethod
@ -156,6 +182,7 @@ class CogVideoLoraSelect:
        print(cog_loras_list)
        return (cog_loras_list,)
 #region TextEncode    
 class CogVideoEncodePrompt:
    @classmethod
    def INPUT_TYPES(s):
@ -257,8 +284,8 @@ class CogVideoTextEncode:
            }
        }
-    RETURN_TYPES = ("CONDITIONING",)
+    RETURN_TYPES = ("CONDITIONING", "CLIP",)
-    RETURN_NAMES = ("conditioning",)
+    RETURN_NAMES = ("conditioning", "clip")
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"
@ -279,7 +306,7 @@ class CogVideoTextEncode:
        if force_offload:
            clip.cond_stage_model.to(offload_device)
-        return (embeds, )
+        return (embeds, clip, )
 class CogVideoTextEncodeCombine:
    @classmethod
@ -312,6 +339,7 @@ class CogVideoTextEncodeCombine:
        return (embeds, )
 #region ImageEncode    
 class CogVideoImageEncode:
    @classmethod
    def INPUT_TYPES(s):
@ -474,6 +502,7 @@ class CogVideoImageInterpolationEncode:
        return ({"samples": final_latents}, )
 #region Tora    
 from .tora.traj_utils import process_traj, scale_traj_list_to_256
 from torchvision.utils import flow_to_image
@ -631,7 +660,93 @@ class ToraEncodeOpticalFlow:
        return (tora, )   
 def add_noise_to_reference_video(image, ratio=None):
    """Return ``image`` plus Gaussian noise scaled per batch item.

    Args:
        image: tensor indexed with five dimensions here; the first is the
            batch axis used to size the per-item noise scale.
        ratio: fixed noise scale applied to every batch item. When ``None``
            a scale is sampled per item as ``exp(N(-3.0, 0.5))``.

    Returns:
        A new tensor; positions where ``image == -1`` receive no noise.
    """
    batch = image.shape[0]
    if ratio is not None:
        sigma = torch.ones((batch,)).to(image.device, image.dtype) * ratio
    else:
        # Sample the log of the scale first, then exponentiate.
        log_sigma = torch.normal(mean=-3.0, std=0.5, size=(batch,)).to(image.device)
        sigma = torch.exp(log_sigma).to(image.dtype)

    # Broadcast the per-item scale across the remaining four dimensions.
    noise = torch.randn_like(image) * sigma[:, None, None, None, None]
    # Leave entries equal to -1 untouched by zeroing their noise.
    noise = torch.where(image==-1, torch.zeros_like(image), noise)
    return image + noise
 class CogVideoControlImageEncode:
    """Node that VAE-encodes a control video into latents for a Fun control pipeline.

    The video is resized to the closest aspect-ratio bucket derived from
    ``ASPECT_RATIO_512`` scaled by ``base_resolution``, optionally noised,
    and encoded with the VAE taken from ``pipeline["pipe"]``.
    """

    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "pipeline": ("COGVIDEOPIPE",),
            "control_video": ("IMAGE", ),
            "base_resolution": ("INT", {"min": 64, "max": 1280, "step": 64, "default": 512, "tooltip": "Base resolution, closest training data bucket resolution is chosen based on the selection."}),
            "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
            "noise_aug_strength": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}),
            },
        }

    RETURN_TYPES = ("COGCONTROL_LATENTS", "INT", "INT",)
    RETURN_NAMES = ("control_latents", "width", "height")
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    # NOTE(review): the Python default for noise_aug_strength (0.0563) differs
    # from the widget default declared in INPUT_TYPES (0.0); kept as-is so the
    # signature stays unchanged.
    def encode(self, pipeline, control_video, base_resolution, enable_tiling, noise_aug_strength=0.0563):
        """Encode ``control_video`` and return ``(control_latents, width, height)``.

        Args:
            pipeline: dict read for ``"pipe"`` (provides ``vae`` and
                ``image_processor``) and ``"cpu_offloading"``.
            control_video: 4-D tensor unpacked as ``(B, H, W, C)``.
            base_resolution: scales the ``ASPECT_RATIO_512`` buckets by
                ``base_resolution / 512`` before the closest one is chosen.
            enable_tiling: when true, VAE encode tiling is enabled.
            noise_aug_strength: when greater than 0, noise with this ratio
                is added to the video before encoding.

        Returns:
            A tuple of the latents dict (keys ``latents``, ``num_frames``,
            ``height``, ``width``), the bucket width and the bucket height.
        """
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()

        B, _, _, _ = control_video.shape

        vae = pipeline["pipe"].vae
        vae.enable_slicing()

        if enable_tiling:
            from .mz_enable_vae_encode_tiling import enable_vae_encode_tiling
            enable_vae_encode_tiling(vae)

        # Device placement is only managed here when CPU offloading is off.
        manage_vae_device = not pipeline["cpu_offloading"]
        if manage_vae_device:
            vae.to(device)

        # Count most suitable height and width
        aspect_ratio_sample_size    = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}

        control_video = np.array(control_video.cpu().numpy() * 255, np.uint8)
        original_width, original_height = Image.fromarray(control_video[0]).size

        closest_size, _ = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
        # Round both sides down to a multiple of 16.
        height, width = [int(x / 16) * 16 for x in closest_size]
        log.info(f"Closest bucket size: {width}x{height}")

        # Trim the frame count to k * temporal_compression_ratio + 1 (a single frame stays 1).
        video_length = int((B - 1) // vae.config.temporal_compression_ratio * vae.config.temporal_compression_ratio) + 1 if B != 1 else 1
        input_video, _, _ = get_video_to_video_latent(control_video, video_length=video_length, sample_size=(height, width))

        control_video = pipeline["pipe"].image_processor.preprocess(rearrange(input_video, "b c f h w -> (b f) c h w"), height=height, width=width)
        control_video = control_video.to(dtype=torch.float32)
        control_video = rearrange(control_video, "(b f) c h w -> b c f h w", f=video_length)

        masked_image = control_video.to(device=device, dtype=vae.dtype)
        if noise_aug_strength > 0:
            masked_image = add_noise_to_reference_video(masked_image, ratio=noise_aug_strength)

        # Encode one batch item at a time and keep the mode of each encode result.
        bs = 1
        new_mask_pixel_values = []
        for i in range(0, masked_image.shape[0], bs):
            mask_pixel_values_bs = masked_image[i : i + bs]
            mask_pixel_values_bs = vae.encode(mask_pixel_values_bs)[0]
            mask_pixel_values_bs = mask_pixel_values_bs.mode()
            new_mask_pixel_values.append(mask_pixel_values_bs)
        masked_image_latents = torch.cat(new_mask_pixel_values, dim = 0)
        masked_image_latents = masked_image_latents * vae.config.scaling_factor

        # Mirror the guard used for the move to the compute device: when CPU
        # offloading is enabled this node never moved the VAE, so it must not
        # move it afterwards either.
        if manage_vae_device:
            vae.to(offload_device)

        control_latents = {
            "latents": masked_image_latents,
            # NOTE(review): this is the input frame count B, not the trimmed video_length.
            "num_frames" : B,
            "height" : height,
            "width" : width,
        }

        return (control_latents, width, height)
 #region FasterCache
 class CogVideoXFasterCache:
    @classmethod
    def INPUT_TYPES(s):
@ -660,6 +775,7 @@ class CogVideoXFasterCache:
        }
        return (fastercache,)
 #region Sampler    
 class CogVideoSampler:
    @classmethod
    def INPUT_TYPES(s):
@ -783,6 +899,42 @@ class CogVideoSampler:
        return (pipeline, {"samples": latents})
 class CogVideoControlNet:
    """Node that bundles a controlnet model, its control frames and scheduling values into one dict."""
    @classmethod
    def INPUT_TYPES(s):
        """Declare the required inputs: the controlnet model, the control images and three float widgets."""
        return {"required": {
            "controlnet": ("COGVIDECONTROLNETMODEL",),
            "images": ("IMAGE", ),
            "control_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
            "control_start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            "control_end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            },
        }
    RETURN_TYPES = ("COGVIDECONTROLNET",)
    RETURN_NAMES = ("cogvideo_controlnet",)
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"
    def encode(self, controlnet, images, control_strength, control_start_percent, control_end_percent):
        """Pack the controlnet model and its control frames into a single dict.

        Args:
            controlnet: The controlnet model object; stored unchanged under "control_model".
            images: 4-D tensor whose last axis is moved to position 1 (B, H, W, C -> B, C, H, W).
            control_strength: Stored unchanged under "control_weights".
            control_start_percent: Stored unchanged under "control_start".
            control_end_percent: Stored unchanged under "control_end".

        Returns:
            A one-element tuple holding the dict described above.

        Raises:
            ValueError: If ``images`` does not have exactly four dimensions.
        """
        # Explicit rank check; previously this was only enforced implicitly by
        # unpacking ``images.shape`` into four otherwise unused names.
        if images.ndim != 4:
            raise ValueError(
                f"images must have 4 dimensions (B, H, W, C), got shape {tuple(images.shape)}"
            )
        # Channels-first, plus a leading singleton axis, then x * 2 - 1
        # (presumably maps [0, 1] pixel values to [-1, 1] -- confirm against the caller).
        control_frames = images.permute(0, 3, 1, 2).unsqueeze(0) * 2 - 1
        controlnet_options = {
            "control_model": controlnet,
            "control_frames": control_frames,
            "control_weights": control_strength,
            "control_start": control_start_percent,
            "control_end": control_end_percent,
        }
        return (controlnet_options,)
 #region VideoDecode    
 class CogVideoDecode:
    @classmethod
    def INPUT_TYPES(s):
@ -879,6 +1031,7 @@ class CogVideoXFunResizeToClosestBucket:
        return (resized_images, width, height)
 #region FunSamplers
 class CogVideoXFunSampler:
    @classmethod
    def INPUT_TYPES(s):
@ -888,7 +1041,8 @@ class CogVideoXFunSampler:
                "positive": ("CONDITIONING", ),
                "negative": ("CONDITIONING", ),
                "video_length": ("INT", {"default": 49, "min": 5, "max": 2048, "step": 4}),
-                "base_resolution": ("INT", {"min": 64, "max": 1280, "step": 64, "default": 512, "tooltip": "Base resolution, closest training data bucket resolution is chosen based on the selection."}),
+                "width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
                "height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
                "seed": ("INT", {"default": 43, "min": 0, "max": 0xffffffffffffffff}),
                "steps": ("INT", {"default": 50, "min": 1, "max": 200, "step": 1}),
                "cfg": ("FLOAT", {"default": 6.0, "min": 1.0, "max": 20.0, "step": 0.01}),
@ -897,7 +1051,6 @@ class CogVideoXFunSampler:
            "optional":{
                "start_img": ("IMAGE",),
                "end_img": ("IMAGE",),
                "opt_empty_latent": ("LATENT",),
                "noise_aug_strength": ("FLOAT", {"default": 0.0563, "min": 0.0, "max": 1.0, "step": 0.001}),
                "context_options": ("COGCONTEXT", ),
                "tora_trajectory": ("TORAFEATURES", ),
@ -912,8 +1065,8 @@ class CogVideoXFunSampler:
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"
-    def process(self, pipeline,  positive, negative, video_length, base_resolution, seed, steps, cfg, scheduler, 
+    def process(self, pipeline,  positive, negative, video_length, width, height, seed, steps, cfg, scheduler, 
-                start_img=None, end_img=None, opt_empty_latent=None, noise_aug_strength=0.0563, context_options=None, fastercache=None, 
+                start_img=None, end_img=None, noise_aug_strength=0.0563, context_options=None, fastercache=None, 
                tora_trajectory=None, vid2vid_images=None, vid2vid_denoise=1.0):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
@ -929,23 +1082,13 @@ class CogVideoXFunSampler:
        mm.soft_empty_cache()
        aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
        #vid2vid
        if vid2vid_images is not None:
            validation_video = np.array(vid2vid_images.cpu().numpy() * 255, np.uint8)
            original_width, original_height = Image.fromarray(validation_video[0]).size
        #img2vid
        elif start_img is not None:
            start_img = [to_pil(_start_img) for _start_img in start_img] if start_img is not None else None
            end_img = [to_pil(_end_img) for _end_img in end_img] if end_img is not None else None       
            # Count most suitable height and width
            original_width, original_height = start_img[0].size if type(start_img) is list else Image.open(start_img).size
        else:
            original_width = opt_empty_latent["samples"][0].shape[-1] * 8
            original_height = opt_empty_latent["samples"][0].shape[-2] * 8
        closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
        height, width = [int(x / 16) * 16 for x in closest_size]
        log.info(f"Closest bucket size: {width}x{height}")
        # Load Sampler
        if context_options is not None and context_options["context_schedule"] == "temporal_tiling":
@ -1046,156 +1189,6 @@ class CogVideoXFunVid2VidSampler:
    def process(self):
        """Take no inputs and hand back an empty output tuple."""
        return tuple()
 def add_noise_to_reference_video(image, ratio=None):
    """Return ``image`` plus Gaussian noise, leaving entries equal to -1 untouched.

    One noise scale is used per item along the first axis. When ``ratio`` is
    given, every item uses that value; otherwise each scale is ``exp(z)`` with
    ``z`` drawn from a normal distribution (mean -3.0, std 0.5). The scale is
    broadcast over four trailing axes, so ``image`` is indexed as a 5-D tensor.
    """
    batch = image.shape[0]
    if ratio is not None:
        # Fixed strength: the same scale for every batch item.
        unit = torch.ones((batch,)).to(image.device, image.dtype)
        noise_scale = unit * ratio
    else:
        # Random strength: sample the log-scale first, then exponentiate.
        log_scale = torch.normal(mean=-3.0, std=0.5, size=(batch,)).to(image.device)
        noise_scale = torch.exp(log_scale).to(image.dtype)
    noise = torch.randn_like(image) * noise_scale[:, None, None, None, None]
    # Positions whose value is exactly -1 receive no noise.
    untouched = image == -1
    noise = torch.where(untouched, torch.zeros_like(image), noise)
    return image + noise
 class CogVideoControlImageEncode:
    """Node that resizes a control video to the closest size bucket and VAE-encodes it into latents."""
    RETURN_TYPES = ("COGCONTROL_LATENTS", "INT", "INT",)
    RETURN_NAMES = ("control_latents", "width", "height")
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    @classmethod
    def INPUT_TYPES(s):
        """Describe the node's required inputs and their widget options."""
        required = {
            "pipeline": ("COGVIDEOPIPE",),
            "control_video": ("IMAGE", ),
            "base_resolution": ("INT", {"min": 64, "max": 1280, "step": 64, "default": 512, "tooltip": "Base resolution, closest training data bucket resolution is chosen based on the selection."}),
            "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
            "noise_aug_strength": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}),
        }
        return {"required": required}

    def encode(self, pipeline, control_video, base_resolution, enable_tiling, noise_aug_strength=0.0563):
        """Encode ``control_video`` with the pipeline's VAE.

        The bucket closest to the first frame's aspect ratio is picked from
        ASPECT_RATIO_512 scaled by ``base_resolution``; both sides are then
        rounded down to a multiple of 16. Optional noise is added before
        encoding when ``noise_aug_strength`` is positive. Note that the
        signature default (0.0563) differs from the widget default (0.0).

        Returns:
            ``(control_latents, width, height)`` where ``control_latents`` is a
            dict with keys "latents", "num_frames", "height" and "width".
            "num_frames" holds the input's first-axis length, not the trimmed
            ``video_length`` used for encoding.
        """
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        frame_count, _, _, _ = control_video.shape

        pipe = pipeline["pipe"]
        vae = pipe.vae
        vae.enable_slicing()
        if enable_tiling:
            from .mz_enable_vae_encode_tiling import enable_vae_encode_tiling
            enable_vae_encode_tiling(vae)
        if not pipeline["cpu_offloading"]:
            vae.to(device)

        # Rescale every 512-based bucket to the requested base resolution.
        bucket_sizes = {
            key: [side / 512 * base_resolution for side in sides]
            for key, sides in ASPECT_RATIO_512.items()
        }

        frames_u8 = np.array(control_video.cpu().numpy() * 255, np.uint8)
        original_width, original_height = Image.fromarray(frames_u8[0]).size
        closest_size, _ = get_closest_ratio(original_height, original_width, ratios=bucket_sizes)
        height, width = [int(side / 16) * 16 for side in closest_size]
        log.info(f"Closest bucket size: {width}x{height}")

        # Trim the frame count to (multiple of the temporal compression ratio) + 1.
        if frame_count == 1:
            video_length = 1
        else:
            temporal_ratio = vae.config.temporal_compression_ratio
            video_length = int((frame_count - 1) // temporal_ratio * temporal_ratio) + 1

        input_video, _, _ = get_video_to_video_latent(frames_u8, video_length=video_length, sample_size=(height, width))
        flat_frames = rearrange(input_video, "b c f h w -> (b f) c h w")
        processed = pipe.image_processor.preprocess(flat_frames, height=height, width=width)
        processed = processed.to(dtype=torch.float32)
        processed = rearrange(processed, "(b f) c h w -> b c f h w", f=video_length)

        masked_image = processed.to(device=device, dtype=vae.dtype)
        if noise_aug_strength > 0:
            masked_image = add_noise_to_reference_video(masked_image, ratio=noise_aug_strength)

        # Encode one batch item at a time and keep the mode of each encoded distribution.
        encoded = [
            vae.encode(masked_image[start : start + 1])[0].mode()
            for start in range(0, masked_image.shape[0], 1)
        ]
        masked_image_latents = torch.cat(encoded, dim=0)
        masked_image_latents = masked_image_latents * vae.config.scaling_factor

        vae.to(offload_device)

        control_latents = dict(
            latents=masked_image_latents,
            num_frames=frame_count,
            height=height,
            width=width,
        )
        return (control_latents, width, height)
 class CogVideoControlNet:
    """Node that bundles a controlnet model, its control frames and scheduling values into one dict."""
    @classmethod
    def INPUT_TYPES(s):
        """Declare the required inputs: the controlnet model, the control images and three float widgets."""
        return {"required": {
            "controlnet": ("COGVIDECONTROLNETMODEL",),
            "images": ("IMAGE", ),
            "control_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
            "control_start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            "control_end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            },
        }
    RETURN_TYPES = ("COGVIDECONTROLNET",)
    RETURN_NAMES = ("cogvideo_controlnet",)
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"
    def encode(self, controlnet, images, control_strength, control_start_percent, control_end_percent):
        """Pack the controlnet model and its control frames into a single dict.

        Args:
            controlnet: The controlnet model object; stored unchanged under "control_model".
            images: 4-D tensor whose last axis is moved to position 1 (B, H, W, C -> B, C, H, W).
            control_strength: Stored unchanged under "control_weights".
            control_start_percent: Stored unchanged under "control_start".
            control_end_percent: Stored unchanged under "control_end".

        Returns:
            A one-element tuple holding the dict described above.

        Raises:
            ValueError: If ``images`` does not have exactly four dimensions.
        """
        # Explicit rank check; previously this was only enforced implicitly by
        # unpacking ``images.shape`` into four otherwise unused names.
        if images.ndim != 4:
            raise ValueError(
                f"images must have 4 dimensions (B, H, W, C), got shape {tuple(images.shape)}"
            )
        # Channels-first, plus a leading singleton axis, then x * 2 - 1
        # (presumably maps [0, 1] pixel values to [-1, 1] -- confirm against the caller).
        control_frames = images.permute(0, 3, 1, 2).unsqueeze(0) * 2 - 1
        controlnet_options = {
            "control_model": controlnet,
            "control_frames": control_frames,
            "control_weights": control_strength,
            "control_start": control_start_percent,
            "control_end": control_end_percent,
        }
        return (controlnet_options,)
 class CogVideoContextOptions:
    """Node that collects the context-window settings into a single options dict."""
    RETURN_TYPES = ("COGCONTEXT", )
    RETURN_NAMES = ("context_options",)
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"

    @classmethod
    def INPUT_TYPES(s):
        """Describe the required inputs: a schedule choice, three integer sizes and a boolean."""
        schedules = ["uniform_standard", "uniform_looped", "static_standard", "temporal_tiling"]
        required = {
            "context_schedule": (schedules,),
            "context_frames": ("INT", {"default": 48, "min": 2, "max": 100, "step": 1, "tooltip": "Number of pixel frames in the context, NOTE: the latent space has 4 frames in 1"} ),
            "context_stride": ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context stride as pixel frames, NOTE: the latent space has 4 frames in 1"} ),
            "context_overlap": ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context overlap as pixel frames, NOTE: the latent space has 4 frames in 1"} ),
            "freenoise": ("BOOLEAN", {"default": True, "tooltip": "Shuffle the noise"}),
        }
        return {"required": required}

    def process(self, context_schedule, context_frames, context_stride, context_overlap, freenoise):
        """Return a one-element tuple holding the inputs keyed by their own parameter names."""
        names = ("context_schedule", "context_frames", "context_stride", "context_overlap", "freenoise")
        values = (context_schedule, context_frames, context_stride, context_overlap, freenoise)
        return (dict(zip(names, values)),)
 class CogVideoXFunControlSampler:
    @classmethod
    def INPUT_TYPES(s):