Refactor Fun sampler to be easier to use with Tora (breaks old workflows!)

The FunSampler node in old workflows needs to be remade. I moved the forced bucket resize to its own node if anyone still wants to use that.
This commit is contained in:
kijai 2024-11-07 13:01:34 +02:00
parent 666f7832f9
commit 9202921920
4 changed files with 2036 additions and 611 deletions

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,6 @@
{ {
"last_node_id": 48, "last_node_id": 51,
"last_link_id": 101, "last_link_id": 114,
"nodes": [ "nodes": [
{ {
"id": 20, "id": 20,
@ -22,8 +22,7 @@
"name": "CLIP", "name": "CLIP",
"type": "CLIP", "type": "CLIP",
"links": [ "links": [
54, 54
56
], ],
"slot_index": 0, "slot_index": 0,
"shape": 3 "shape": 3
@ -46,16 +45,16 @@
}, },
"size": { "size": {
"0": 463.01251220703125, "0": 463.01251220703125,
"1": 124 "1": 144
}, },
"flags": {}, "flags": {},
"order": 4, "order": 5,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
"name": "clip", "name": "clip",
"type": "CLIP", "type": "CLIP",
"link": 56 "link": 108
} }
], ],
"outputs": [ "outputs": [
@ -63,10 +62,15 @@
"name": "conditioning", "name": "conditioning",
"type": "CONDITIONING", "type": "CONDITIONING",
"links": [ "links": [
86 111
], ],
"slot_index": 0, "slot_index": 0,
"shape": 3 "shape": 3
},
{
"name": "clip",
"type": "CLIP",
"links": null
} }
], ],
"properties": { "properties": {
@ -87,7 +91,7 @@
}, },
"size": [ "size": [
855.81494140625, 855.81494140625,
927.6441243489584 881.2099609375
], ],
"flags": {}, "flags": {},
"order": 8, "order": 8,
@ -101,17 +105,20 @@
{ {
"name": "audio", "name": "audio",
"type": "AUDIO", "type": "AUDIO",
"link": null "link": null,
"shape": 7
}, },
{ {
"name": "meta_batch", "name": "meta_batch",
"type": "VHS_BatchManager", "type": "VHS_BatchManager",
"link": null "link": null,
"shape": 7
}, },
{ {
"name": "vae", "name": "vae",
"type": "VAE", "type": "VAE",
"link": null "link": null,
"shape": 7
} }
], ],
"outputs": [ "outputs": [
@ -139,7 +146,7 @@
"hidden": false, "hidden": false,
"paused": false, "paused": false,
"params": { "params": {
"filename": "CogVideoX_Fun_00012.mp4", "filename": "CogVideoX_Fun_00003.mp4",
"subfolder": "", "subfolder": "",
"type": "temp", "type": "temp",
"format": "video/h264-mp4", "format": "video/h264-mp4",
@ -149,61 +156,12 @@
} }
} }
}, },
{
"id": 11,
"type": "CogVideoDecode",
"pos": {
"0": 1448,
"1": 345
},
"size": {
"0": 300.396484375,
"1": 198
},
"flags": {},
"order": 7,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 89
},
{
"name": "samples",
"type": "LATENT",
"link": 88
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
97
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
},
"widgets_values": [
true,
240,
360,
0.2,
0.2,
true
]
},
{ {
"id": 36, "id": 36,
"type": "LoadImage", "type": "LoadImage",
"pos": { "pos": {
"0": 364, "0": 227,
"1": 715 "1": 700
}, },
"size": { "size": {
"0": 391.3421325683594, "0": 391.3421325683594,
@ -242,15 +200,15 @@
"id": 37, "id": 37,
"type": "ImageResizeKJ", "type": "ImageResizeKJ",
"pos": { "pos": {
"0": 824, "0": 688,
"1": 715 "1": 708
}, },
"size": { "size": {
"0": 315, "0": 315,
"1": 266 "1": 266
}, },
"flags": {}, "flags": {},
"order": 5, "order": 4,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
@ -261,7 +219,8 @@
{ {
"name": "get_image_size", "name": "get_image_size",
"type": "IMAGE", "type": "IMAGE",
"link": null "link": null,
"shape": 7
}, },
{ {
"name": "width_input", "name": "width_input",
@ -285,7 +244,7 @@
"name": "IMAGE", "name": "IMAGE",
"type": "IMAGE", "type": "IMAGE",
"links": [ "links": [
87 112
], ],
"slot_index": 0, "slot_index": 0,
"shape": 3 "shape": 3
@ -317,6 +276,55 @@
"disabled" "disabled"
] ]
}, },
{
"id": 11,
"type": "CogVideoDecode",
"pos": {
"0": 1477,
"1": 344
},
"size": {
"0": 300.396484375,
"1": 198
},
"flags": {},
"order": 7,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 113
},
{
"name": "samples",
"type": "LATENT",
"link": 114
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
97
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
},
"widgets_values": [
true,
240,
360,
0.2,
0.2,
true
]
},
{ {
"id": 30, "id": 30,
"type": "CogVideoTextEncode", "type": "CogVideoTextEncode",
@ -343,10 +351,18 @@
"name": "conditioning", "name": "conditioning",
"type": "CONDITIONING", "type": "CONDITIONING",
"links": [ "links": [
85 110
], ],
"slot_index": 0, "slot_index": 0,
"shape": 3 "shape": 3
},
{
"name": "clip",
"type": "CLIP",
"links": [
108
],
"slot_index": 1
} }
], ],
"properties": { "properties": {
@ -355,55 +371,19 @@
"widgets_values": [ "widgets_values": [
"majestic stag grazing in a forest and basking in the setting sun", "majestic stag grazing in a forest and basking in the setting sun",
1, 1,
true false
] ]
}, },
{ {
"id": 48, "id": 51,
"type": "DownloadAndLoadCogVideoGGUFModel",
"pos": {
"0": 584,
"1": 103
},
"size": {
"0": 378,
"1": 130
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
101
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoGGUFModel"
},
"widgets_values": [
"CogVideoX_5b_fun_GGUF_Q4_0.safetensors",
"bf16",
false,
"offload_device"
]
},
{
"id": 41,
"type": "CogVideoXFunSampler", "type": "CogVideoXFunSampler",
"pos": { "pos": {
"0": 1058, "0": 1058,
"1": 345 "1": 345
}, },
"size": { "size": {
"0": 315, "0": 367.79998779296875,
"1": 302 "1": 434
}, },
"flags": {}, "flags": {},
"order": 6, "order": 6,
@ -412,32 +392,53 @@
{ {
"name": "pipeline", "name": "pipeline",
"type": "COGVIDEOPIPE", "type": "COGVIDEOPIPE",
"link": 101 "link": 109
}, },
{ {
"name": "positive", "name": "positive",
"type": "CONDITIONING", "type": "CONDITIONING",
"link": 85 "link": 110
}, },
{ {
"name": "negative", "name": "negative",
"type": "CONDITIONING", "type": "CONDITIONING",
"link": 86 "link": 111
}, },
{ {
"name": "start_img", "name": "start_img",
"type": "IMAGE", "type": "IMAGE",
"link": 87 "link": 112,
"shape": 7
}, },
{ {
"name": "end_img", "name": "end_img",
"type": "IMAGE", "type": "IMAGE",
"link": null "link": null,
"shape": 7
}, },
{ {
"name": "opt_empty_latent", "name": "context_options",
"type": "LATENT", "type": "COGCONTEXT",
"link": null "link": null,
"shape": 7
},
{
"name": "tora_trajectory",
"type": "TORAFEATURES",
"link": null,
"shape": 7
},
{
"name": "fastercache",
"type": "FASTERCACHEARGS",
"link": null,
"shape": 7
},
{
"name": "vid2vid_images",
"type": "IMAGE",
"link": null,
"shape": 7
} }
], ],
"outputs": [ "outputs": [
@ -445,18 +446,15 @@
"name": "cogvideo_pipe", "name": "cogvideo_pipe",
"type": "COGVIDEOPIPE", "type": "COGVIDEOPIPE",
"links": [ "links": [
89 113
], ]
"slot_index": 0,
"shape": 3
}, },
{ {
"name": "samples", "name": "samples",
"type": "LATENT", "type": "LATENT",
"links": [ "links": [
88 114
], ]
"shape": 3
} }
], ],
"properties": { "properties": {
@ -464,12 +462,66 @@
}, },
"widgets_values": [ "widgets_values": [
49, 49,
512, 720,
44, 480,
"fixed", 43,
30, "randomize",
50,
6, 6,
"CogVideoXDPMScheduler" "DDIM",
0.0563,
1
]
},
{
"id": 48,
"type": "DownloadAndLoadCogVideoGGUFModel",
"pos": {
"0": 585,
"1": 34
},
"size": {
"0": 378,
"1": 198
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "pab_config",
"type": "PAB_CONFIG",
"link": null,
"shape": 7
},
{
"name": "block_edit",
"type": "TRANSFORMERBLOCKS",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
109
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoGGUFModel"
},
"widgets_values": [
"CogVideoX_5b_fun_1_1_GGUF_Q4_0.safetensors",
"bf16",
false,
"offload_device",
false,
"disabled"
] ]
} }
], ],
@ -482,14 +534,6 @@
0, 0,
"CLIP" "CLIP"
], ],
[
56,
20,
0,
31,
0,
"CLIP"
],
[ [
71, 71,
36, 36,
@ -498,46 +542,6 @@
0, 0,
"IMAGE" "IMAGE"
], ],
[
85,
30,
0,
41,
1,
"CONDITIONING"
],
[
86,
31,
0,
41,
2,
"CONDITIONING"
],
[
87,
37,
0,
41,
3,
"IMAGE"
],
[
88,
41,
1,
11,
1,
"LATENT"
],
[
89,
41,
0,
11,
0,
"COGVIDEOPIPE"
],
[ [
97, 97,
11, 11,
@ -547,22 +551,70 @@
"IMAGE" "IMAGE"
], ],
[ [
101, 108,
30,
1,
31,
0,
"CLIP"
],
[
109,
48, 48,
0, 0,
41, 51,
0, 0,
"COGVIDEOPIPE" "COGVIDEOPIPE"
],
[
110,
30,
0,
51,
1,
"CONDITIONING"
],
[
111,
31,
0,
51,
2,
"CONDITIONING"
],
[
112,
37,
0,
51,
3,
"IMAGE"
],
[
113,
51,
0,
11,
0,
"COGVIDEOPIPE"
],
[
114,
51,
1,
11,
1,
"LATENT"
] ]
], ],
"groups": [], "groups": [],
"config": {}, "config": {},
"extra": { "extra": {
"ds": { "ds": {
"scale": 0.7627768444385654, "scale": 0.7513148009015784,
"offset": [ "offset": [
62.58315607223924, 724.7448506313632,
102.05205752424705 128.336592104936
] ]
} }
}, },

View File

@ -1,6 +1,6 @@
{ {
"last_node_id": 45, "last_node_id": 47,
"last_link_id": 97, "last_link_id": 110,
"nodes": [ "nodes": [
{ {
"id": 20, "id": 20,
@ -22,8 +22,7 @@
"name": "CLIP", "name": "CLIP",
"type": "CLIP", "type": "CLIP",
"links": [ "links": [
54, 54
56
], ],
"slot_index": 0, "slot_index": 0,
"shape": 3 "shape": 3
@ -37,85 +36,6 @@
"sd3" "sd3"
] ]
}, },
{
"id": 37,
"type": "ImageResizeKJ",
"pos": {
"0": 824,
"1": 715
},
"size": {
"0": 315,
"1": 266
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 71
},
{
"name": "get_image_size",
"type": "IMAGE",
"link": null
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
87
],
"slot_index": 0,
"shape": 3
},
{
"name": "width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "height",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [
720,
480,
"nearest-exact",
false,
2,
0,
0,
"disabled"
]
},
{ {
"id": 11, "id": 11,
"type": "CogVideoDecode", "type": "CogVideoDecode",
@ -134,12 +54,12 @@
{ {
"name": "pipeline", "name": "pipeline",
"type": "COGVIDEOPIPE", "type": "COGVIDEOPIPE",
"link": 89 "link": 108
}, },
{ {
"name": "samples", "name": "samples",
"type": "LATENT", "type": "LATENT",
"link": 88 "link": 109
} }
], ],
"outputs": [ "outputs": [
@ -165,43 +85,6 @@
true true
] ]
}, },
{
"id": 1,
"type": "DownloadAndLoadCogVideoModel",
"pos": {
"0": 642,
"1": 90
},
"size": {
"0": 337.8885192871094,
"1": 154
},
"flags": {},
"order": 1,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
84
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoModel"
},
"widgets_values": [
"kijai/CogVideoX-Fun-5b",
"bf16",
"disabled",
"disabled",
false
]
},
{ {
"id": 31, "id": 31,
"type": "CogVideoTextEncode", "type": "CogVideoTextEncode",
@ -211,16 +94,16 @@
}, },
"size": { "size": {
"0": 463.01251220703125, "0": 463.01251220703125,
"1": 98.10446166992188 "1": 144
}, },
"flags": {}, "flags": {},
"order": 4, "order": 5,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
"name": "clip", "name": "clip",
"type": "CLIP", "type": "CLIP",
"link": 56 "link": 110
} }
], ],
"outputs": [ "outputs": [
@ -228,17 +111,24 @@
"name": "conditioning", "name": "conditioning",
"type": "CONDITIONING", "type": "CONDITIONING",
"links": [ "links": [
86 106
], ],
"slot_index": 0, "slot_index": 0,
"shape": 3 "shape": 3
},
{
"name": "clip",
"type": "CLIP",
"links": null
} }
], ],
"properties": { "properties": {
"Node name for S&R": "CogVideoTextEncode" "Node name for S&R": "CogVideoTextEncode"
}, },
"widgets_values": [ "widgets_values": [
"The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. " "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. ",
1,
true
] ]
}, },
{ {
@ -249,8 +139,8 @@
"1": 345 "1": 345
}, },
"size": [ "size": [
605.3909898931465, 605.3909912109375,
724.5306772953109 714.2606608072917
], ],
"flags": {}, "flags": {},
"order": 8, "order": 8,
@ -264,17 +154,20 @@
{ {
"name": "audio", "name": "audio",
"type": "AUDIO", "type": "AUDIO",
"link": null "link": null,
"shape": 7
}, },
{ {
"name": "meta_batch", "name": "meta_batch",
"type": "VHS_BatchManager", "type": "VHS_BatchManager",
"link": null "link": null,
"shape": 7
}, },
{ {
"name": "vae", "name": "vae",
"type": "VAE", "type": "VAE",
"link": null "link": null,
"shape": 7
} }
], ],
"outputs": [ "outputs": [
@ -302,7 +195,7 @@
"hidden": false, "hidden": false,
"paused": false, "paused": false,
"params": { "params": {
"filename": "CogVideoX_Fun_00003.mp4", "filename": "CogVideoX_Fun_00001.mp4",
"subfolder": "", "subfolder": "",
"type": "temp", "type": "temp",
"format": "video/h264-mp4", "format": "video/h264-mp4",
@ -313,15 +206,191 @@
} }
}, },
{ {
"id": 41, "id": 36,
"type": "CogVideoXFunSampler", "type": "LoadImage",
"pos": { "pos": {
"0": 1058, "0": 325,
"1": 345 "1": 715
},
"size": {
"0": 432.4361877441406,
"1": 361.0254211425781
},
"flags": {},
"order": 1,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
71
],
"slot_index": 0,
"shape": 3
},
{
"name": "MASK",
"type": "MASK",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"6e1a7befce6daa63fc01cb66c1a22ed0.jpg",
"image"
]
},
{
"id": 1,
"type": "DownloadAndLoadCogVideoModel",
"pos": {
"0": 602,
"1": 53
},
"size": {
"0": 337.8885192871094,
"1": 194
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "pab_config",
"type": "PAB_CONFIG",
"link": null,
"shape": 7
},
{
"name": "block_edit",
"type": "TRANSFORMERBLOCKS",
"link": null,
"shape": 7
},
{
"name": "lora",
"type": "COGLORA",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
104
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoModel"
},
"widgets_values": [
"kijai/CogVideoX-Fun-5b",
"bf16",
"disabled",
"disabled",
false
]
},
{
"id": 37,
"type": "ImageResizeKJ",
"pos": {
"0": 824,
"1": 715
}, },
"size": { "size": {
"0": 315, "0": 315,
"1": 282 "1": 266
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 71
},
{
"name": "get_image_size",
"type": "IMAGE",
"link": null,
"shape": 7
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
107
],
"slot_index": 0,
"shape": 3
},
{
"name": "width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "height",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [
720,
480,
"lanczos",
false,
2,
0,
0,
"disabled"
]
},
{
"id": 47,
"type": "CogVideoXFunSampler",
"pos": {
"0": 1068,
"1": 198
},
"size": {
"0": 367.79998779296875,
"1": 434
}, },
"flags": {}, "flags": {},
"order": 6, "order": 6,
@ -330,27 +399,53 @@
{ {
"name": "pipeline", "name": "pipeline",
"type": "COGVIDEOPIPE", "type": "COGVIDEOPIPE",
"link": 84 "link": 104
}, },
{ {
"name": "positive", "name": "positive",
"type": "CONDITIONING", "type": "CONDITIONING",
"link": 85 "link": 105
}, },
{ {
"name": "negative", "name": "negative",
"type": "CONDITIONING", "type": "CONDITIONING",
"link": 86 "link": 106
}, },
{ {
"name": "start_img", "name": "start_img",
"type": "IMAGE", "type": "IMAGE",
"link": 87 "link": 107,
"shape": 7
}, },
{ {
"name": "end_img", "name": "end_img",
"type": "IMAGE", "type": "IMAGE",
"link": null "link": null,
"shape": 7
},
{
"name": "context_options",
"type": "COGCONTEXT",
"link": null,
"shape": 7
},
{
"name": "tora_trajectory",
"type": "TORAFEATURES",
"link": null,
"shape": 7
},
{
"name": "fastercache",
"type": "FASTERCACHEARGS",
"link": null,
"shape": 7
},
{
"name": "vid2vid_images",
"type": "IMAGE",
"link": null,
"shape": 7
} }
], ],
"outputs": [ "outputs": [
@ -358,18 +453,15 @@
"name": "cogvideo_pipe", "name": "cogvideo_pipe",
"type": "COGVIDEOPIPE", "type": "COGVIDEOPIPE",
"links": [ "links": [
89 108
], ]
"slot_index": 0,
"shape": 3
}, },
{ {
"name": "samples", "name": "samples",
"type": "LATENT", "type": "LATENT",
"links": [ "links": [
88 109
], ]
"shape": 3
} }
], ],
"properties": { "properties": {
@ -377,12 +469,15 @@
}, },
"widgets_values": [ "widgets_values": [
49, 49,
512, 720,
480,
43, 43,
"fixed", "fixed",
30, 50,
6, 6,
"DPM++" "DDIM",
0.0563,
1
] ]
}, },
{ {
@ -411,57 +506,27 @@
"name": "conditioning", "name": "conditioning",
"type": "CONDITIONING", "type": "CONDITIONING",
"links": [ "links": [
85 105
], ],
"slot_index": 0, "slot_index": 0,
"shape": 3 "shape": 3
},
{
"name": "clip",
"type": "CLIP",
"links": [
110
],
"slot_index": 1
} }
], ],
"properties": { "properties": {
"Node name for S&R": "CogVideoTextEncode" "Node name for S&R": "CogVideoTextEncode"
}, },
"widgets_values": [ "widgets_values": [
"fireworks display over night city. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic." "fireworks display over night city. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
] 1,
}, false
{
"id": 36,
"type": "LoadImage",
"pos": {
"0": 325,
"1": 715
},
"size": {
"0": 432.4361877441406,
"1": 361.0254211425781
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
71
],
"slot_index": 0,
"shape": 3
},
{
"name": "MASK",
"type": "MASK",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"6e1a7befce6daa63fc01cb66c1a22ed0.jpg",
"image"
] ]
} }
], ],
@ -474,14 +539,6 @@
0, 0,
"CLIP" "CLIP"
], ],
[
56,
20,
0,
31,
0,
"CLIP"
],
[ [
71, 71,
36, 36,
@ -490,54 +547,6 @@
0, 0,
"IMAGE" "IMAGE"
], ],
[
84,
1,
0,
41,
0,
"COGVIDEOPIPE"
],
[
85,
30,
0,
41,
1,
"CONDITIONING"
],
[
86,
31,
0,
41,
2,
"CONDITIONING"
],
[
87,
37,
0,
41,
3,
"IMAGE"
],
[
88,
41,
1,
11,
1,
"LATENT"
],
[
89,
41,
0,
11,
0,
"COGVIDEOPIPE"
],
[ [
97, 97,
11, 11,
@ -545,16 +554,72 @@
44, 44,
0, 0,
"IMAGE" "IMAGE"
],
[
104,
1,
0,
47,
0,
"COGVIDEOPIPE"
],
[
105,
30,
0,
47,
1,
"CONDITIONING"
],
[
106,
31,
0,
47,
2,
"CONDITIONING"
],
[
107,
37,
0,
47,
3,
"IMAGE"
],
[
108,
47,
0,
11,
0,
"COGVIDEOPIPE"
],
[
109,
47,
1,
11,
1,
"LATENT"
],
[
110,
30,
1,
31,
0,
"CLIP"
] ]
], ],
"groups": [], "groups": [],
"config": {}, "config": {},
"extra": { "extra": {
"ds": { "ds": {
"scale": 0.8264462809917361, "scale": 0.8264462809917363,
"offset": [ "offset": [
97.64239267521098, 245.90746806300405,
39.894747674006986 108.93624646284617
] ]
} }
}, },

327
nodes.py
View File

@ -101,7 +101,33 @@ class CogVideoPABConfig:
return (pab_config, ) return (pab_config, )
class CogVideoContextOptions:
    """ComfyUI node that packs context-window settings into a single dict.

    The node performs no computation: it forwards its five inputs unchanged
    as a ``COGCONTEXT`` dictionary keyed by the input names.
    """

    RETURN_TYPES = ("COGCONTEXT", )
    RETURN_NAMES = ("context_options",)
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"

    @classmethod
    def INPUT_TYPES(s):
        """Return the input specification (all five inputs are required)."""
        schedules = ["uniform_standard", "uniform_looped", "static_standard", "temporal_tiling"]
        required = {
            "context_schedule": (schedules,),
            "context_frames": ("INT", {"default": 48, "min": 2, "max": 100, "step": 1, "tooltip": "Number of pixel frames in the context, NOTE: the latent space has 4 frames in 1"}),
            "context_stride": ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context stride as pixel frames, NOTE: the latent space has 4 frames in 1"}),
            "context_overlap": ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context overlap as pixel frames, NOTE: the latent space has 4 frames in 1"}),
            "freenoise": ("BOOLEAN", {"default": True, "tooltip": "Shuffle the noise"}),
        }
        return {"required": required}

    def process(self, context_schedule, context_frames, context_stride, context_overlap, freenoise):
        """Bundle the inputs into one dict and return it as a 1-tuple."""
        return (
            {
                "context_schedule": context_schedule,
                "context_frames": context_frames,
                "context_stride": context_stride,
                "context_overlap": context_overlap,
                "freenoise": freenoise,
            },
        )
class CogVideoTransformerEdit: class CogVideoTransformerEdit:
@classmethod @classmethod
@ -156,6 +182,7 @@ class CogVideoLoraSelect:
print(cog_loras_list) print(cog_loras_list)
return (cog_loras_list,) return (cog_loras_list,)
#region TextEncode
class CogVideoEncodePrompt: class CogVideoEncodePrompt:
@classmethod @classmethod
def INPUT_TYPES(s): def INPUT_TYPES(s):
@ -257,8 +284,8 @@ class CogVideoTextEncode:
} }
} }
RETURN_TYPES = ("CONDITIONING",) RETURN_TYPES = ("CONDITIONING", "CLIP",)
RETURN_NAMES = ("conditioning",) RETURN_NAMES = ("conditioning", "clip")
FUNCTION = "process" FUNCTION = "process"
CATEGORY = "CogVideoWrapper" CATEGORY = "CogVideoWrapper"
@ -279,7 +306,7 @@ class CogVideoTextEncode:
if force_offload: if force_offload:
clip.cond_stage_model.to(offload_device) clip.cond_stage_model.to(offload_device)
return (embeds, ) return (embeds, clip, )
class CogVideoTextEncodeCombine: class CogVideoTextEncodeCombine:
@classmethod @classmethod
@ -312,6 +339,7 @@ class CogVideoTextEncodeCombine:
return (embeds, ) return (embeds, )
#region ImageEncode
class CogVideoImageEncode: class CogVideoImageEncode:
@classmethod @classmethod
def INPUT_TYPES(s): def INPUT_TYPES(s):
@ -474,6 +502,7 @@ class CogVideoImageInterpolationEncode:
return ({"samples": final_latents}, ) return ({"samples": final_latents}, )
#region Tora
from .tora.traj_utils import process_traj, scale_traj_list_to_256 from .tora.traj_utils import process_traj, scale_traj_list_to_256
from torchvision.utils import flow_to_image from torchvision.utils import flow_to_image
@ -631,7 +660,93 @@ class ToraEncodeOpticalFlow:
return (tora, ) return (tora, )
def add_noise_to_reference_video(image, ratio=None):
    """Add per-sample Gaussian noise to a batch of reference frames.

    Args:
        image: Tensor whose first dimension indexes samples. Positions whose
            value equals -1 receive no noise.
        ratio: Noise scale applied to every sample. When ``None``, one sigma
            per sample is drawn as ``exp(N(mean=-3.0, std=0.5))``.

    Returns:
        A new tensor equal to ``image`` plus the masked noise; the input
        tensor is not modified in place.
    """
    batch = image.shape[0]
    if ratio is None:
        sigma = torch.normal(mean=-3.0, std=0.5, size=(batch,)).to(image.device)
        sigma = torch.exp(sigma).to(image.dtype)
    else:
        sigma = torch.ones((batch,)).to(image.device, image.dtype) * ratio
    # Shape sigma as (batch, 1, ..., 1) for whatever rank the input has.
    # A hard-coded 5-D index would silently broadcast lower-rank inputs
    # into an output of the wrong shape.
    sigma = sigma.reshape(batch, *([1] * (image.dim() - 1)))
    image_noise = torch.randn_like(image) * sigma
    # Keep positions holding the -1 sentinel free of noise.
    image_noise = torch.where(image == -1, torch.zeros_like(image), image_noise)
    return image + image_noise
class CogVideoControlImageEncode:
    """ComfyUI node that VAE-encodes a control video into latents.

    The frames are resized to the aspect-ratio bucket closest to the input
    (scaled by ``base_resolution``), optionally noise-augmented, and encoded
    one batch item at a time with the pipeline's VAE.
    """

    RETURN_TYPES = ("COGCONTROL_LATENTS", "INT", "INT",)
    RETURN_NAMES = ("control_latents", "width", "height")
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    @classmethod
    def INPUT_TYPES(s):
        """Return the input specification (all inputs are required)."""
        required = {
            "pipeline": ("COGVIDEOPIPE",),
            "control_video": ("IMAGE", ),
            "base_resolution": ("INT", {"min": 64, "max": 1280, "step": 64, "default": 512, "tooltip": "Base resolution, closest training data bucket resolution is chosen based on the selection."}),
            "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
            "noise_aug_strength": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}),
        }
        return {"required": required}

    def encode(self, pipeline, control_video, base_resolution, enable_tiling, noise_aug_strength=0.0563):
        """Encode ``control_video`` and return ``(control_latents, width, height)``.

        Args:
            pipeline: Dict providing ``"pipe"`` (with ``vae`` and
                ``image_processor``) and the ``"cpu_offloading"`` flag.
            control_video: 4-D tensor; its first dimension is used as the
                frame count (presumably frames, height, width, channels —
                confirm against callers).
            base_resolution: Scale applied to the ``ASPECT_RATIO_512`` buckets.
            enable_tiling: Turn on tiled VAE encoding before encoding.
            noise_aug_strength: Noise ratio; noise is added only when > 0.

        Returns:
            A tuple of the latents dict (keys ``latents``, ``num_frames``,
            ``height``, ``width``), the bucket width and the bucket height.
        """
        target_device = mm.get_torch_device()
        idle_device = mm.unet_offload_device()

        # Unpacking four values also rejects input that is not 4-D.
        num_frames, _, _, _ = control_video.shape

        vae = pipeline["pipe"].vae
        vae.enable_slicing()
        if enable_tiling:
            from .mz_enable_vae_encode_tiling import enable_vae_encode_tiling
            enable_vae_encode_tiling(vae)
        if not pipeline["cpu_offloading"]:
            vae.to(target_device)

        # Pick the bucket whose aspect ratio is closest to the first frame.
        bucket_sizes = {
            name: [side / 512 * base_resolution for side in sides]
            for name, sides in ASPECT_RATIO_512.items()
        }
        control_video = np.array(control_video.cpu().numpy() * 255, np.uint8)
        source_width, source_height = Image.fromarray(control_video[0]).size
        closest_size, _ = get_closest_ratio(source_height, source_width, ratios=bucket_sizes)
        # Snap both sides down to a multiple of 16.
        height, width = [int(side / 16) * 16 for side in closest_size]
        log.info(f"Closest bucket size: {width}x{height}")

        # Largest length of the form k * temporal_compression_ratio + 1 that
        # fits in the input; a single frame stays a single frame.
        if num_frames != 1:
            video_length = int((num_frames - 1) // vae.config.temporal_compression_ratio * vae.config.temporal_compression_ratio) + 1
        else:
            video_length = 1

        input_video, _, _ = get_video_to_video_latent(control_video, video_length=video_length, sample_size=(height, width))

        flat_frames = rearrange(input_video, "b c f h w -> (b f) c h w")
        control_video = pipeline["pipe"].image_processor.preprocess(flat_frames, height=height, width=width)
        control_video = control_video.to(dtype=torch.float32)
        control_video = rearrange(control_video, "(b f) c h w -> b c f h w", f=video_length)

        masked_image = control_video.to(device=target_device, dtype=vae.dtype)
        if noise_aug_strength > 0:
            masked_image = add_noise_to_reference_video(masked_image, ratio=noise_aug_strength)

        # Encode one batch item per VAE call and use the distribution's mode.
        chunk = 1
        encoded = [
            vae.encode(masked_image[start : start + chunk])[0].mode()
            for start in range(0, masked_image.shape[0], chunk)
        ]
        masked_image_latents = torch.cat(encoded, dim = 0)
        masked_image_latents = masked_image_latents * vae.config.scaling_factor

        vae.to(idle_device)

        control_latents = {
            "latents": masked_image_latents,
            "num_frames" : num_frames,
            "height" : height,
            "width" : width,
        }
        return (control_latents, width, height)
#region FasterCache
class CogVideoXFasterCache: class CogVideoXFasterCache:
@classmethod @classmethod
def INPUT_TYPES(s): def INPUT_TYPES(s):
@ -660,6 +775,7 @@ class CogVideoXFasterCache:
} }
return (fastercache,) return (fastercache,)
#region Sampler
class CogVideoSampler: class CogVideoSampler:
@classmethod @classmethod
def INPUT_TYPES(s): def INPUT_TYPES(s):
@ -783,6 +899,42 @@ class CogVideoSampler:
return (pipeline, {"samples": latents}) return (pipeline, {"samples": latents})
class CogVideoControlNet:
    """ComfyUI node that packages a controlnet model with its control frames.

    No model is run here; the node rescales the images and returns them
    together with the strength and start/end settings in one dict.
    """

    RETURN_TYPES = ("COGVIDECONTROLNET",)
    RETURN_NAMES = ("cogvideo_controlnet",)
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    @classmethod
    def INPUT_TYPES(s):
        """Return the input specification (all inputs are required)."""
        required = {
            "controlnet": ("COGVIDECONTROLNETMODEL",),
            "images": ("IMAGE", ),
            "control_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
            "control_start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            "control_end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
        }
        return {"required": required}

    def encode(self, controlnet, images, control_strength, control_start_percent, control_end_percent):
        """Build the controlnet settings dict and return it as a 1-tuple.

        ``images`` must be 4-D. Dimension 3 is moved to position 1, a leading
        dimension of size 1 is added, and values are mapped with ``x * 2 - 1``.
        """
        # Device lookups are kept from the original; their results are unused.
        _device = mm.get_torch_device()
        _offload_device = mm.unet_offload_device()

        # Unpacking four values rejects input that is not 4-D.
        _b, _h, _w, _c = images.shape

        rescaled_frames = images.permute(0, 3, 1, 2).unsqueeze(0) * 2 - 1

        return (
            {
                "control_model": controlnet,
                "control_frames": rescaled_frames,
                "control_weights": control_strength,
                "control_start": control_start_percent,
                "control_end": control_end_percent,
            },
        )
#region VideoDecode
class CogVideoDecode: class CogVideoDecode:
@classmethod @classmethod
def INPUT_TYPES(s): def INPUT_TYPES(s):
@ -879,6 +1031,7 @@ class CogVideoXFunResizeToClosestBucket:
return (resized_images, width, height) return (resized_images, width, height)
#region FunSamplers
class CogVideoXFunSampler: class CogVideoXFunSampler:
@classmethod @classmethod
def INPUT_TYPES(s): def INPUT_TYPES(s):
@ -888,7 +1041,8 @@ class CogVideoXFunSampler:
"positive": ("CONDITIONING", ), "positive": ("CONDITIONING", ),
"negative": ("CONDITIONING", ), "negative": ("CONDITIONING", ),
"video_length": ("INT", {"default": 49, "min": 5, "max": 2048, "step": 4}), "video_length": ("INT", {"default": 49, "min": 5, "max": 2048, "step": 4}),
"base_resolution": ("INT", {"min": 64, "max": 1280, "step": 64, "default": 512, "tooltip": "Base resolution, closest training data bucket resolution is chosen based on the selection."}), "width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
"seed": ("INT", {"default": 43, "min": 0, "max": 0xffffffffffffffff}), "seed": ("INT", {"default": 43, "min": 0, "max": 0xffffffffffffffff}),
"steps": ("INT", {"default": 50, "min": 1, "max": 200, "step": 1}), "steps": ("INT", {"default": 50, "min": 1, "max": 200, "step": 1}),
"cfg": ("FLOAT", {"default": 6.0, "min": 1.0, "max": 20.0, "step": 0.01}), "cfg": ("FLOAT", {"default": 6.0, "min": 1.0, "max": 20.0, "step": 0.01}),
@ -897,7 +1051,6 @@ class CogVideoXFunSampler:
"optional":{ "optional":{
"start_img": ("IMAGE",), "start_img": ("IMAGE",),
"end_img": ("IMAGE",), "end_img": ("IMAGE",),
"opt_empty_latent": ("LATENT",),
"noise_aug_strength": ("FLOAT", {"default": 0.0563, "min": 0.0, "max": 1.0, "step": 0.001}), "noise_aug_strength": ("FLOAT", {"default": 0.0563, "min": 0.0, "max": 1.0, "step": 0.001}),
"context_options": ("COGCONTEXT", ), "context_options": ("COGCONTEXT", ),
"tora_trajectory": ("TORAFEATURES", ), "tora_trajectory": ("TORAFEATURES", ),
@ -912,8 +1065,8 @@ class CogVideoXFunSampler:
FUNCTION = "process" FUNCTION = "process"
CATEGORY = "CogVideoWrapper" CATEGORY = "CogVideoWrapper"
def process(self, pipeline, positive, negative, video_length, base_resolution, seed, steps, cfg, scheduler, def process(self, pipeline, positive, negative, video_length, width, height, seed, steps, cfg, scheduler,
start_img=None, end_img=None, opt_empty_latent=None, noise_aug_strength=0.0563, context_options=None, fastercache=None, start_img=None, end_img=None, noise_aug_strength=0.0563, context_options=None, fastercache=None,
tora_trajectory=None, vid2vid_images=None, vid2vid_denoise=1.0): tora_trajectory=None, vid2vid_images=None, vid2vid_denoise=1.0):
device = mm.get_torch_device() device = mm.get_torch_device()
offload_device = mm.unet_offload_device() offload_device = mm.unet_offload_device()
@ -929,23 +1082,13 @@ class CogVideoXFunSampler:
mm.soft_empty_cache() mm.soft_empty_cache()
aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
#vid2vid #vid2vid
if vid2vid_images is not None: if vid2vid_images is not None:
validation_video = np.array(vid2vid_images.cpu().numpy() * 255, np.uint8) validation_video = np.array(vid2vid_images.cpu().numpy() * 255, np.uint8)
original_width, original_height = Image.fromarray(validation_video[0]).size
#img2vid #img2vid
elif start_img is not None: elif start_img is not None:
start_img = [to_pil(_start_img) for _start_img in start_img] if start_img is not None else None start_img = [to_pil(_start_img) for _start_img in start_img] if start_img is not None else None
end_img = [to_pil(_end_img) for _end_img in end_img] if end_img is not None else None end_img = [to_pil(_end_img) for _end_img in end_img] if end_img is not None else None
# Count most suitable height and width
original_width, original_height = start_img[0].size if type(start_img) is list else Image.open(start_img).size
else:
original_width = opt_empty_latent["samples"][0].shape[-1] * 8
original_height = opt_empty_latent["samples"][0].shape[-2] * 8
closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
height, width = [int(x / 16) * 16 for x in closest_size]
log.info(f"Closest bucket size: {width}x{height}")
# Load Sampler # Load Sampler
if context_options is not None and context_options["context_schedule"] == "temporal_tiling": if context_options is not None and context_options["context_schedule"] == "temporal_tiling":
@ -1046,156 +1189,6 @@ class CogVideoXFunVid2VidSampler:
def process(self): def process(self):
return () return ()
def add_noise_to_reference_video(image, ratio=None):
    """Add per-sample Gaussian noise to a batch of reference frames.

    Args:
        image: Tensor whose first dimension indexes samples. Positions whose
            value equals -1 receive no noise.
        ratio: Noise scale applied to every sample. When ``None``, one sigma
            per sample is drawn as ``exp(N(mean=-3.0, std=0.5))``.

    Returns:
        A new tensor equal to ``image`` plus the masked noise; the input
        tensor is not modified in place.
    """
    batch = image.shape[0]
    if ratio is None:
        sigma = torch.normal(mean=-3.0, std=0.5, size=(batch,)).to(image.device)
        sigma = torch.exp(sigma).to(image.dtype)
    else:
        sigma = torch.ones((batch,)).to(image.device, image.dtype) * ratio
    # Shape sigma as (batch, 1, ..., 1) for whatever rank the input has.
    # A hard-coded 5-D index would silently broadcast lower-rank inputs
    # into an output of the wrong shape.
    sigma = sigma.reshape(batch, *([1] * (image.dim() - 1)))
    image_noise = torch.randn_like(image) * sigma
    # Keep positions holding the -1 sentinel free of noise.
    image_noise = torch.where(image == -1, torch.zeros_like(image), image_noise)
    return image + image_noise
class CogVideoControlImageEncode:
    """ComfyUI node that VAE-encodes a control video into control latents.

    The video is resized to the closest aspect-ratio bucket derived from
    ``base_resolution`` before encoding; the chosen bucket size is returned
    alongside the latents.
    """

    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "pipeline": ("COGVIDEOPIPE",),
            "control_video": ("IMAGE", ),
            "base_resolution": ("INT", {"min": 64, "max": 1280, "step": 64, "default": 512, "tooltip": "Base resolution, closest training data bucket resolution is chosen based on the selection."}),
            "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
            "noise_aug_strength": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}),
            },
        }

    RETURN_TYPES = ("COGCONTROL_LATENTS", "INT", "INT",)
    RETURN_NAMES = ("control_latents", "width", "height")
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    # NOTE(review): the signature default for noise_aug_strength (0.0563)
    # differs from the UI default declared in INPUT_TYPES (0.0). It is kept
    # as-is so direct Python callers see no change.
    def encode(self, pipeline, control_video, base_resolution, enable_tiling, noise_aug_strength=0.0563):
        """Encode ``control_video`` with the pipeline's VAE.

        Args:
            pipeline: Dict providing ``"pipe"`` (an object exposing ``vae`` and
                ``image_processor``) and the ``"cpu_offloading"`` flag.
            control_video: 4-D tensor indexed as (frames, height, width,
                channels); values are multiplied by 255 and cast to uint8, so
                they are presumably in [0, 1] (confirm against callers).
            base_resolution: Scales the 512-based aspect-ratio bucket table.
            enable_tiling: Patch the VAE for tiled encoding to reduce memory.
            noise_aug_strength: When > 0, Gaussian noise with this standard
                deviation is added to the frames before encoding.

        Returns:
            ``(control_latents, width, height)`` where ``control_latents`` is a
            dict with keys ``latents``, ``num_frames``, ``height``, ``width``.
        """
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()

        # Unpacking four values also rejects input that is not 4-D.
        num_frames, original_height, original_width, _ = control_video.shape

        vae = pipeline["pipe"].vae
        vae.enable_slicing()
        if enable_tiling:
            from .mz_enable_vae_encode_tiling import enable_vae_encode_tiling
            enable_vae_encode_tiling(vae)
        if not pipeline["cpu_offloading"]:
            vae.to(device)

        # Count most suitable height and width
        aspect_ratio_sample_size = {
            key: [x / 512 * base_resolution for x in sizes]
            for key, sizes in ASPECT_RATIO_512.items()
        }
        # The frame size comes straight from the tensor shape; a PIL image
        # built from one (height, width, channels) frame reports the same
        # (width, height), so no image round-trip is needed.
        closest_size, _ = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
        height, width = [int(x / 16) * 16 for x in closest_size]
        log.info(f"Closest bucket size: {width}x{height}")

        # The helper below receives the frames as a uint8 numpy array.
        control_video = np.array(control_video.cpu().numpy() * 255, np.uint8)

        # Trim the frame count to k * temporal_compression_ratio + 1.
        temporal_ratio = vae.config.temporal_compression_ratio
        video_length = int((num_frames - 1) // temporal_ratio * temporal_ratio) + 1 if num_frames != 1 else 1
        input_video, _, _ = get_video_to_video_latent(control_video, video_length=video_length, sample_size=(height, width))

        control_video = pipeline["pipe"].image_processor.preprocess(rearrange(input_video, "b c f h w -> (b f) c h w"), height=height, width=width)
        control_video = control_video.to(dtype=torch.float32)
        control_video = rearrange(control_video, "(b f) c h w -> b c f h w", f=video_length)

        masked_image = control_video.to(device=device, dtype=vae.dtype)
        if noise_aug_strength > 0:
            masked_image = add_noise_to_reference_video(masked_image, ratio=noise_aug_strength)

        # Encode one batch entry at a time and concatenate the results.
        bs = 1
        latent_chunks = []
        for i in range(0, masked_image.shape[0], bs):
            encoded = vae.encode(masked_image[i : i + bs])[0]
            latent_chunks.append(encoded.mode())
        masked_image_latents = torch.cat(latent_chunks, dim=0)
        masked_image_latents = masked_image_latents * vae.config.scaling_factor

        vae.to(offload_device)

        control_latents = {
            "latents": masked_image_latents,
            # NOTE(review): this is the input frame count, not the trimmed
            # video_length used for encoding - confirm the consumer expects that.
            "num_frames" : num_frames,
            "height" : height,
            "width" : width,
        }

        return (control_latents, width, height)
class CogVideoControlNet:
    """ComfyUI node that bundles a controlnet model with its control frames and schedule."""

    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "controlnet": ("COGVIDECONTROLNETMODEL",),
            "images": ("IMAGE", ),
            "control_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
            "control_start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            "control_end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            },
        }

    RETURN_TYPES = ("COGVIDECONTROLNET",)
    RETURN_NAMES = ("cogvideo_controlnet",)
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    def encode(self, controlnet, images, control_strength, control_start_percent, control_end_percent):
        """Package the controlnet model and its conditioning frames.

        Args:
            controlnet: The loaded controlnet model, passed through untouched.
            images: 4-D tensor; the last axis is moved to position 1 and values
                are rescaled with ``x * 2 - 1`` (so [0, 1] input becomes [-1, 1]).
            control_strength: Stored as ``control_weights``.
            control_start_percent: Stored as ``control_start``.
            control_end_percent: Stored as ``control_end``.

        Returns:
            A one-element tuple holding the settings dict.

        Raises:
            ValueError: If ``images`` is not 4-dimensional.
        """
        # The original relied on a 4-way tuple unpack of images.shape to
        # reject bad input; keep the same exception type, but say why.
        if len(images.shape) != 4:
            raise ValueError(
                f"images must be a 4-D tensor, got shape {tuple(images.shape)}"
            )

        # Move the last axis to position 1 and add a leading batch axis of 1.
        control_frames = images.permute(0, 3, 1, 2).unsqueeze(0) * 2 - 1

        controlnet_settings = {
            "control_model": controlnet,
            "control_frames": control_frames,
            "control_weights": control_strength,
            "control_start": control_start_percent,
            "control_end": control_end_percent,
        }

        return (controlnet_settings,)
class CogVideoContextOptions:
    """ComfyUI node that packs context-window settings into one options dict."""

    RETURN_TYPES = ("COGCONTEXT", )
    RETURN_NAMES = ("context_options",)
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"

    @classmethod
    def INPUT_TYPES(s):
        # Every frame-count tooltip ends with the same latent-space reminder.
        latent_note = ", NOTE: the latent space has 4 frames in 1"
        schedules = ["uniform_standard", "uniform_looped", "static_standard", "temporal_tiling"]

        required = {
            "context_schedule": (schedules,),
            "context_frames": ("INT", {
                "default": 48, "min": 2, "max": 100, "step": 1,
                "tooltip": "Number of pixel frames in the context" + latent_note,
            }),
            "context_stride": ("INT", {
                "default": 4, "min": 4, "max": 100, "step": 1,
                "tooltip": "Context stride as pixel frames" + latent_note,
            }),
            "context_overlap": ("INT", {
                "default": 4, "min": 4, "max": 100, "step": 1,
                "tooltip": "Context overlap as pixel frames" + latent_note,
            }),
            "freenoise": ("BOOLEAN", {"default": True, "tooltip": "Shuffle the noise"}),
        }
        return {"required": required}

    def process(self, context_schedule, context_frames, context_stride, context_overlap, freenoise):
        """Return the given settings as a one-element tuple holding a dict keyed by parameter name."""
        settings = dict(
            context_schedule=context_schedule,
            context_frames=context_frames,
            context_stride=context_stride,
            context_overlap=context_overlap,
            freenoise=freenoise,
        )
        return (settings,)
class CogVideoXFunControlSampler: class CogVideoXFunControlSampler:
@classmethod @classmethod
def INPUT_TYPES(s): def INPUT_TYPES(s):