Refactor Fun sampler to be easier to use with Tora (breaks old workflows!)

The FunSampler node in old workflows needs to be recreated. I moved the forced bucket resize to its own node for anyone who still wants to use it.
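
In practice the sampler's inputs change like this (a rough sketch based on the nodes.py diff below, not an exhaustive list):

# Old CogVideoXFunSampler: the node picked the closest training bucket
# from base_resolution and resized internally.
#   video_length, base_resolution, seed, steps, cfg, scheduler
# New CogVideoXFunSampler: takes explicit dimensions instead, plus new
# optional context_options, tora_trajectory, fastercache and vid2vid_images inputs.
#   video_length, width, height, seed, steps, cfg, scheduler
# To keep the old forced-resize behavior, run the start image through the
# new CogVideoXFunResizeToClosestBucket node and wire its width/height
# outputs into the sampler.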
kijai 2024-11-07 13:01:34 +02:00
parent 666f7832f9
commit 9202921920
4 changed files with 2036 additions and 611 deletions

File diff suppressed because one or more lines are too long


@ -1,6 +1,6 @@
{
"last_node_id": 48,
"last_link_id": 101,
"last_node_id": 51,
"last_link_id": 114,
"nodes": [
{
"id": 20,
@ -22,8 +22,7 @@
"name": "CLIP",
"type": "CLIP",
"links": [
54,
56
54
],
"slot_index": 0,
"shape": 3
@ -46,16 +45,16 @@
},
"size": {
"0": 463.01251220703125,
"1": 124
"1": 144
},
"flags": {},
"order": 4,
"order": 5,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 56
"link": 108
}
],
"outputs": [
@ -63,10 +62,15 @@
"name": "conditioning",
"type": "CONDITIONING",
"links": [
86
111
],
"slot_index": 0,
"shape": 3
},
{
"name": "clip",
"type": "CLIP",
"links": null
}
],
"properties": {
@ -87,7 +91,7 @@
},
"size": [
855.81494140625,
927.6441243489584
881.2099609375
],
"flags": {},
"order": 8,
@ -101,17 +105,20 @@
{
"name": "audio",
"type": "AUDIO",
"link": null
"link": null,
"shape": 7
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
"link": null,
"shape": 7
},
{
"name": "vae",
"type": "VAE",
"link": null
"link": null,
"shape": 7
}
],
"outputs": [
@ -139,7 +146,7 @@
"hidden": false,
"paused": false,
"params": {
"filename": "CogVideoX_Fun_00012.mp4",
"filename": "CogVideoX_Fun_00003.mp4",
"subfolder": "",
"type": "temp",
"format": "video/h264-mp4",
@ -149,61 +156,12 @@
}
}
},
{
"id": 11,
"type": "CogVideoDecode",
"pos": {
"0": 1448,
"1": 345
},
"size": {
"0": 300.396484375,
"1": 198
},
"flags": {},
"order": 7,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 89
},
{
"name": "samples",
"type": "LATENT",
"link": 88
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
97
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
},
"widgets_values": [
true,
240,
360,
0.2,
0.2,
true
]
},
{
"id": 36,
"type": "LoadImage",
"pos": {
"0": 364,
"1": 715
"0": 227,
"1": 700
},
"size": {
"0": 391.3421325683594,
@ -242,15 +200,15 @@
"id": 37,
"type": "ImageResizeKJ",
"pos": {
"0": 824,
"1": 715
"0": 688,
"1": 708
},
"size": {
"0": 315,
"1": 266
},
"flags": {},
"order": 5,
"order": 4,
"mode": 0,
"inputs": [
{
@ -261,7 +219,8 @@
{
"name": "get_image_size",
"type": "IMAGE",
"link": null
"link": null,
"shape": 7
},
{
"name": "width_input",
@ -285,7 +244,7 @@
"name": "IMAGE",
"type": "IMAGE",
"links": [
87
112
],
"slot_index": 0,
"shape": 3
@ -317,6 +276,55 @@
"disabled"
]
},
{
"id": 11,
"type": "CogVideoDecode",
"pos": {
"0": 1477,
"1": 344
},
"size": {
"0": 300.396484375,
"1": 198
},
"flags": {},
"order": 7,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 113
},
{
"name": "samples",
"type": "LATENT",
"link": 114
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
97
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
},
"widgets_values": [
true,
240,
360,
0.2,
0.2,
true
]
},
{
"id": 30,
"type": "CogVideoTextEncode",
@ -343,10 +351,18 @@
"name": "conditioning",
"type": "CONDITIONING",
"links": [
85
110
],
"slot_index": 0,
"shape": 3
},
{
"name": "clip",
"type": "CLIP",
"links": [
108
],
"slot_index": 1
}
],
"properties": {
@ -355,55 +371,19 @@
"widgets_values": [
"majestic stag grazing in a forest and basking in the setting sun",
1,
true
false
]
},
{
"id": 48,
"type": "DownloadAndLoadCogVideoGGUFModel",
"pos": {
"0": 584,
"1": 103
},
"size": {
"0": 378,
"1": 130
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
101
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoGGUFModel"
},
"widgets_values": [
"CogVideoX_5b_fun_GGUF_Q4_0.safetensors",
"bf16",
false,
"offload_device"
]
},
{
"id": 41,
"id": 51,
"type": "CogVideoXFunSampler",
"pos": {
"0": 1058,
"1": 345
},
"size": {
"0": 315,
"1": 302
"0": 367.79998779296875,
"1": 434
},
"flags": {},
"order": 6,
@ -412,32 +392,53 @@
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 101
"link": 109
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 85
"link": 110
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 86
"link": 111
},
{
"name": "start_img",
"type": "IMAGE",
"link": 87
"link": 112,
"shape": 7
},
{
"name": "end_img",
"type": "IMAGE",
"link": null
"link": null,
"shape": 7
},
{
"name": "opt_empty_latent",
"type": "LATENT",
"link": null
"name": "context_options",
"type": "COGCONTEXT",
"link": null,
"shape": 7
},
{
"name": "tora_trajectory",
"type": "TORAFEATURES",
"link": null,
"shape": 7
},
{
"name": "fastercache",
"type": "FASTERCACHEARGS",
"link": null,
"shape": 7
},
{
"name": "vid2vid_images",
"type": "IMAGE",
"link": null,
"shape": 7
}
],
"outputs": [
@ -445,18 +446,15 @@
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
89
],
"slot_index": 0,
"shape": 3
113
]
},
{
"name": "samples",
"type": "LATENT",
"links": [
88
],
"shape": 3
114
]
}
],
"properties": {
@ -464,12 +462,66 @@
},
"widgets_values": [
49,
512,
44,
"fixed",
30,
720,
480,
43,
"randomize",
50,
6,
"CogVideoXDPMScheduler"
"DDIM",
0.0563,
1
]
},
{
"id": 48,
"type": "DownloadAndLoadCogVideoGGUFModel",
"pos": {
"0": 585,
"1": 34
},
"size": {
"0": 378,
"1": 198
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "pab_config",
"type": "PAB_CONFIG",
"link": null,
"shape": 7
},
{
"name": "block_edit",
"type": "TRANSFORMERBLOCKS",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
109
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoGGUFModel"
},
"widgets_values": [
"CogVideoX_5b_fun_1_1_GGUF_Q4_0.safetensors",
"bf16",
false,
"offload_device",
false,
"disabled"
]
}
],
@ -482,14 +534,6 @@
0,
"CLIP"
],
[
56,
20,
0,
31,
0,
"CLIP"
],
[
71,
36,
@ -498,46 +542,6 @@
0,
"IMAGE"
],
[
85,
30,
0,
41,
1,
"CONDITIONING"
],
[
86,
31,
0,
41,
2,
"CONDITIONING"
],
[
87,
37,
0,
41,
3,
"IMAGE"
],
[
88,
41,
1,
11,
1,
"LATENT"
],
[
89,
41,
0,
11,
0,
"COGVIDEOPIPE"
],
[
97,
11,
@ -547,22 +551,70 @@
"IMAGE"
],
[
101,
108,
30,
1,
31,
0,
"CLIP"
],
[
109,
48,
0,
41,
51,
0,
"COGVIDEOPIPE"
],
[
110,
30,
0,
51,
1,
"CONDITIONING"
],
[
111,
31,
0,
51,
2,
"CONDITIONING"
],
[
112,
37,
0,
51,
3,
"IMAGE"
],
[
113,
51,
0,
11,
0,
"COGVIDEOPIPE"
],
[
114,
51,
1,
11,
1,
"LATENT"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.7627768444385654,
"scale": 0.7513148009015784,
"offset": [
62.58315607223924,
102.05205752424705
724.7448506313632,
128.336592104936
]
}
},


@ -1,6 +1,6 @@
{
"last_node_id": 45,
"last_link_id": 97,
"last_node_id": 47,
"last_link_id": 110,
"nodes": [
{
"id": 20,
@ -22,8 +22,7 @@
"name": "CLIP",
"type": "CLIP",
"links": [
54,
56
54
],
"slot_index": 0,
"shape": 3
@ -37,85 +36,6 @@
"sd3"
]
},
{
"id": 37,
"type": "ImageResizeKJ",
"pos": {
"0": 824,
"1": 715
},
"size": {
"0": 315,
"1": 266
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 71
},
{
"name": "get_image_size",
"type": "IMAGE",
"link": null
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
87
],
"slot_index": 0,
"shape": 3
},
{
"name": "width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "height",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [
720,
480,
"nearest-exact",
false,
2,
0,
0,
"disabled"
]
},
{
"id": 11,
"type": "CogVideoDecode",
@ -134,12 +54,12 @@
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 89
"link": 108
},
{
"name": "samples",
"type": "LATENT",
"link": 88
"link": 109
}
],
"outputs": [
@ -165,43 +85,6 @@
true
]
},
{
"id": 1,
"type": "DownloadAndLoadCogVideoModel",
"pos": {
"0": 642,
"1": 90
},
"size": {
"0": 337.8885192871094,
"1": 154
},
"flags": {},
"order": 1,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
84
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoModel"
},
"widgets_values": [
"kijai/CogVideoX-Fun-5b",
"bf16",
"disabled",
"disabled",
false
]
},
{
"id": 31,
"type": "CogVideoTextEncode",
@ -211,16 +94,16 @@
},
"size": {
"0": 463.01251220703125,
"1": 98.10446166992188
"1": 144
},
"flags": {},
"order": 4,
"order": 5,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 56
"link": 110
}
],
"outputs": [
@ -228,17 +111,24 @@
"name": "conditioning",
"type": "CONDITIONING",
"links": [
86
106
],
"slot_index": 0,
"shape": 3
},
{
"name": "clip",
"type": "CLIP",
"links": null
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. "
"The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. ",
1,
true
]
},
{
@ -249,8 +139,8 @@
"1": 345
},
"size": [
605.3909898931465,
724.5306772953109
605.3909912109375,
714.2606608072917
],
"flags": {},
"order": 8,
@ -264,17 +154,20 @@
{
"name": "audio",
"type": "AUDIO",
"link": null
"link": null,
"shape": 7
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
"link": null,
"shape": 7
},
{
"name": "vae",
"type": "VAE",
"link": null
"link": null,
"shape": 7
}
],
"outputs": [
@ -302,7 +195,7 @@
"hidden": false,
"paused": false,
"params": {
"filename": "CogVideoX_Fun_00003.mp4",
"filename": "CogVideoX_Fun_00001.mp4",
"subfolder": "",
"type": "temp",
"format": "video/h264-mp4",
@ -313,15 +206,191 @@
}
},
{
"id": 41,
"type": "CogVideoXFunSampler",
"id": 36,
"type": "LoadImage",
"pos": {
"0": 1058,
"1": 345
"0": 325,
"1": 715
},
"size": {
"0": 432.4361877441406,
"1": 361.0254211425781
},
"flags": {},
"order": 1,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
71
],
"slot_index": 0,
"shape": 3
},
{
"name": "MASK",
"type": "MASK",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"6e1a7befce6daa63fc01cb66c1a22ed0.jpg",
"image"
]
},
{
"id": 1,
"type": "DownloadAndLoadCogVideoModel",
"pos": {
"0": 602,
"1": 53
},
"size": {
"0": 337.8885192871094,
"1": 194
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "pab_config",
"type": "PAB_CONFIG",
"link": null,
"shape": 7
},
{
"name": "block_edit",
"type": "TRANSFORMERBLOCKS",
"link": null,
"shape": 7
},
{
"name": "lora",
"type": "COGLORA",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
104
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoModel"
},
"widgets_values": [
"kijai/CogVideoX-Fun-5b",
"bf16",
"disabled",
"disabled",
false
]
},
{
"id": 37,
"type": "ImageResizeKJ",
"pos": {
"0": 824,
"1": 715
},
"size": {
"0": 315,
"1": 282
"1": 266
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 71
},
{
"name": "get_image_size",
"type": "IMAGE",
"link": null,
"shape": 7
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
107
],
"slot_index": 0,
"shape": 3
},
{
"name": "width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "height",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [
720,
480,
"lanczos",
false,
2,
0,
0,
"disabled"
]
},
{
"id": 47,
"type": "CogVideoXFunSampler",
"pos": {
"0": 1068,
"1": 198
},
"size": {
"0": 367.79998779296875,
"1": 434
},
"flags": {},
"order": 6,
@ -330,27 +399,53 @@
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 84
"link": 104
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 85
"link": 105
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 86
"link": 106
},
{
"name": "start_img",
"type": "IMAGE",
"link": 87
"link": 107,
"shape": 7
},
{
"name": "end_img",
"type": "IMAGE",
"link": null
"link": null,
"shape": 7
},
{
"name": "context_options",
"type": "COGCONTEXT",
"link": null,
"shape": 7
},
{
"name": "tora_trajectory",
"type": "TORAFEATURES",
"link": null,
"shape": 7
},
{
"name": "fastercache",
"type": "FASTERCACHEARGS",
"link": null,
"shape": 7
},
{
"name": "vid2vid_images",
"type": "IMAGE",
"link": null,
"shape": 7
}
],
"outputs": [
@ -358,18 +453,15 @@
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
89
],
"slot_index": 0,
"shape": 3
108
]
},
{
"name": "samples",
"type": "LATENT",
"links": [
88
],
"shape": 3
109
]
}
],
"properties": {
@ -377,12 +469,15 @@
},
"widgets_values": [
49,
512,
720,
480,
43,
"fixed",
30,
50,
6,
"DPM++"
"DDIM",
0.0563,
1
]
},
{
@ -411,57 +506,27 @@
"name": "conditioning",
"type": "CONDITIONING",
"links": [
85
105
],
"slot_index": 0,
"shape": 3
},
{
"name": "clip",
"type": "CLIP",
"links": [
110
],
"slot_index": 1
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"fireworks display over night city. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic."
]
},
{
"id": 36,
"type": "LoadImage",
"pos": {
"0": 325,
"1": 715
},
"size": {
"0": 432.4361877441406,
"1": 361.0254211425781
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
71
],
"slot_index": 0,
"shape": 3
},
{
"name": "MASK",
"type": "MASK",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"6e1a7befce6daa63fc01cb66c1a22ed0.jpg",
"image"
"fireworks display over night city. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
1,
false
]
}
],
@ -474,14 +539,6 @@
0,
"CLIP"
],
[
56,
20,
0,
31,
0,
"CLIP"
],
[
71,
36,
@ -490,54 +547,6 @@
0,
"IMAGE"
],
[
84,
1,
0,
41,
0,
"COGVIDEOPIPE"
],
[
85,
30,
0,
41,
1,
"CONDITIONING"
],
[
86,
31,
0,
41,
2,
"CONDITIONING"
],
[
87,
37,
0,
41,
3,
"IMAGE"
],
[
88,
41,
1,
11,
1,
"LATENT"
],
[
89,
41,
0,
11,
0,
"COGVIDEOPIPE"
],
[
97,
11,
@ -545,16 +554,72 @@
44,
0,
"IMAGE"
],
[
104,
1,
0,
47,
0,
"COGVIDEOPIPE"
],
[
105,
30,
0,
47,
1,
"CONDITIONING"
],
[
106,
31,
0,
47,
2,
"CONDITIONING"
],
[
107,
37,
0,
47,
3,
"IMAGE"
],
[
108,
47,
0,
11,
0,
"COGVIDEOPIPE"
],
[
109,
47,
1,
11,
1,
"LATENT"
],
[
110,
30,
1,
31,
0,
"CLIP"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.8264462809917361,
"scale": 0.8264462809917363,
"offset": [
97.64239267521098,
39.894747674006986
245.90746806300405,
108.93624646284617
]
}
},

nodes.py (341 lines changed)

@ -101,7 +101,33 @@ class CogVideoPABConfig:
        return (pab_config, )

class CogVideoContextOptions:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "context_schedule": (["uniform_standard", "uniform_looped", "static_standard", "temporal_tiling"],),
            "context_frames": ("INT", {"default": 48, "min": 2, "max": 100, "step": 1, "tooltip": "Number of pixel frames in the context, NOTE: the latent space has 4 frames in 1"} ),
            "context_stride": ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context stride as pixel frames, NOTE: the latent space has 4 frames in 1"} ),
            "context_overlap": ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context overlap as pixel frames, NOTE: the latent space has 4 frames in 1"} ),
            "freenoise": ("BOOLEAN", {"default": True, "tooltip": "Shuffle the noise"}),
            }
        }

    RETURN_TYPES = ("COGCONTEXT", )
    RETURN_NAMES = ("context_options",)
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"

    def process(self, context_schedule, context_frames, context_stride, context_overlap, freenoise):
        context_options = {
            "context_schedule": context_schedule,
            "context_frames": context_frames,
            "context_stride": context_stride,
            "context_overlap": context_overlap,
            "freenoise": freenoise
        }
        return (context_options,)
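
A usage sketch, for illustration (not part of the diff): converting the pixel-frame defaults above to latent frames under the 4-to-1 temporal compression the tooltips mention.

opts = CogVideoContextOptions().process(
    context_schedule="uniform_standard",
    context_frames=48,   # 48 pixel frames -> 48 / 4 = 12 latent frames per window
    context_stride=4,    # 4 pixel frames -> 1 latent frame
    context_overlap=4,   # 4 pixel frames -> 1 latent frame of overlap
    freenoise=True,
)[0]                     # the node returns a plain dict of these settings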
class CogVideoTransformerEdit:
    @classmethod
@ -155,7 +181,8 @@ class CogVideoLoraSelect:
        cog_loras_list.append(cog_lora)
        print(cog_loras_list)
        return (cog_loras_list,)

#region TextEncode
class CogVideoEncodePrompt:
    @classmethod
    def INPUT_TYPES(s):
@ -257,8 +284,8 @@ class CogVideoTextEncode:
            }
        }

    RETURN_TYPES = ("CONDITIONING",)
    RETURN_NAMES = ("conditioning",)
    RETURN_TYPES = ("CONDITIONING", "CLIP",)
    RETURN_NAMES = ("conditioning", "clip")
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"
@ -279,7 +306,7 @@ class CogVideoTextEncode:
        if force_offload:
            clip.cond_stage_model.to(offload_device)

        return (embeds, )
        return (embeds, clip, )
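
This CLIP passthrough is what lets the updated example workflows chain the negative encoder off the positive one instead of wiring the loader's CLIP output twice. A rough sketch of the pattern (encode_pos and encode_neg stand for two CogVideoTextEncode instances; the argument order beyond clip, prompt and force_offload is an assumption):

positive, clip = encode_pos.process(clip, "majestic stag grazing in a forest", 1.0, force_offload=False)
negative, _ = encode_neg.process(clip, "low quality, watermark", 1.0, force_offload=True)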
class CogVideoTextEncodeCombine:
    @classmethod
@ -311,7 +338,8 @@ class CogVideoTextEncodeCombine:
            raise ValueError("Invalid combination mode")
        return (embeds, )

#region ImageEncode
class CogVideoImageEncode:
    @classmethod
    def INPUT_TYPES(s):
@ -473,7 +501,8 @@ class CogVideoImageInterpolationEncode:
        vae.to(offload_device)
        return ({"samples": final_latents}, )

#region Tora
from .tora.traj_utils import process_traj, scale_traj_list_to_256
from torchvision.utils import flow_to_image
@ -630,8 +659,94 @@ class ToraEncodeOpticalFlow:
        }
        return (tora, )
def add_noise_to_reference_video(image, ratio=None):
    if ratio is None:
        sigma = torch.normal(mean=-3.0, std=0.5, size=(image.shape[0],)).to(image.device)
        sigma = torch.exp(sigma).to(image.dtype)
    else:
        sigma = torch.ones((image.shape[0],)).to(image.device, image.dtype) * ratio

    image_noise = torch.randn_like(image) * sigma[:, None, None, None, None]
    image_noise = torch.where(image == -1, torch.zeros_like(image), image_noise)
    image = image + image_noise
    return image
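
A quick sanity check of the helper, for illustration (not part of the diff); the caller below feeds it 5D b,c,f,h,w tensors in [-1, 1]:

import torch

video = torch.rand(1, 3, 9, 60, 90) * 2 - 1                # tiny stand-in: B,C,F,H,W in [-1, 1]
noisy = add_noise_to_reference_video(video, ratio=0.0563)  # 0.0563 is the default noise_aug_strength
assert noisy.shape == video.shape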
class CogVideoControlImageEncode:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "pipeline": ("COGVIDEOPIPE",),
            "control_video": ("IMAGE", ),
            "base_resolution": ("INT", {"min": 64, "max": 1280, "step": 64, "default": 512, "tooltip": "Base resolution, closest training data bucket resolution is chosen based on the selection."}),
            "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
            "noise_aug_strength": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}),
            },
        }

    RETURN_TYPES = ("COGCONTROL_LATENTS", "INT", "INT",)
    RETURN_NAMES = ("control_latents", "width", "height")
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    def encode(self, pipeline, control_video, base_resolution, enable_tiling, noise_aug_strength=0.0563):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()

        B, H, W, C = control_video.shape

        vae = pipeline["pipe"].vae
        vae.enable_slicing()
        if enable_tiling:
            from .mz_enable_vae_encode_tiling import enable_vae_encode_tiling
            enable_vae_encode_tiling(vae)

        if not pipeline["cpu_offloading"]:
            vae.to(device)

        # Count most suitable height and width
        aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
        control_video = np.array(control_video.cpu().numpy() * 255, np.uint8)
        original_width, original_height = Image.fromarray(control_video[0]).size
        closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
        height, width = [int(x / 16) * 16 for x in closest_size]
        log.info(f"Closest bucket size: {width}x{height}")

        video_length = int((B - 1) // vae.config.temporal_compression_ratio * vae.config.temporal_compression_ratio) + 1 if B != 1 else 1
        input_video, input_video_mask, clip_image = get_video_to_video_latent(control_video, video_length=video_length, sample_size=(height, width))

        control_video = pipeline["pipe"].image_processor.preprocess(rearrange(input_video, "b c f h w -> (b f) c h w"), height=height, width=width)
        control_video = control_video.to(dtype=torch.float32)
        control_video = rearrange(control_video, "(b f) c h w -> b c f h w", f=video_length)

        masked_image = control_video.to(device=device, dtype=vae.dtype)
        if noise_aug_strength > 0:
            masked_image = add_noise_to_reference_video(masked_image, ratio=noise_aug_strength)

        bs = 1
        new_mask_pixel_values = []
        for i in range(0, masked_image.shape[0], bs):
            mask_pixel_values_bs = masked_image[i : i + bs]
            mask_pixel_values_bs = vae.encode(mask_pixel_values_bs)[0]
            mask_pixel_values_bs = mask_pixel_values_bs.mode()
            new_mask_pixel_values.append(mask_pixel_values_bs)
        masked_image_latents = torch.cat(new_mask_pixel_values, dim = 0)

        masked_image_latents = masked_image_latents * vae.config.scaling_factor
        vae.to(offload_device)

        control_latents = {
            "latents": masked_image_latents,
            "num_frames" : B,
            "height" : height,
            "width" : width,
        }
        return (control_latents, width, height)
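
A worked example of the bucket selection in encode() above, for illustration (not part of the diff; it assumes ASPECT_RATIO_512 maps ratio keys to [height, width] pairs defined at a 512 base):

base_resolution = 768  # hypothetical choice; buckets scale linearly from the 512 table
buckets = {k: [x / 512 * base_resolution for x in ASPECT_RATIO_512[k]] for k in ASPECT_RATIO_512}
closest_size, closest_ratio = get_closest_ratio(480, 720, ratios=buckets)  # input height, width
height, width = [int(x / 16) * 16 for x in closest_size]                   # snap to multiples of 16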
#region FasterCache
class CogVideoXFasterCache:
    @classmethod
    def INPUT_TYPES(s):
@ -659,7 +774,8 @@ class CogVideoXFasterCache:
"cache_device" : device if cache_device == "main_device" else offload_device
}
return (fastercache,)
#region Sampler
class CogVideoSampler:
@classmethod
def INPUT_TYPES(s):
@ -782,7 +898,43 @@ class CogVideoSampler:
        mm.soft_empty_cache()
        return (pipeline, {"samples": latents})
class CogVideoControlNet:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "controlnet": ("COGVIDECONTROLNETMODEL",),
            "images": ("IMAGE", ),
            "control_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
            "control_start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            "control_end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            },
        }

    RETURN_TYPES = ("COGVIDECONTROLNET",)
    RETURN_NAMES = ("cogvideo_controlnet",)
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    def encode(self, controlnet, images, control_strength, control_start_percent, control_end_percent):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()

        B, H, W, C = images.shape

        control_frames = images.permute(0, 3, 1, 2).unsqueeze(0) * 2 - 1

        controlnet = {
            "control_model": controlnet,
            "control_frames": control_frames,
            "control_weights": control_strength,
            "control_start": control_start_percent,
            "control_end": control_end_percent,
        }
        return (controlnet,)
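
The tensor juggling in encode() above amounts to the following, for illustration (not part of the diff):

import torch

images = torch.rand(9, 60, 90, 3)  # ComfyUI IMAGE batch: B,H,W,C in [0, 1]
control_frames = images.permute(0, 3, 1, 2).unsqueeze(0) * 2 - 1
print(control_frames.shape)        # torch.Size([1, 9, 3, 60, 90]), values now in [-1, 1]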
#region VideoDecode
class CogVideoDecode:
    @classmethod
    def INPUT_TYPES(s):
@ -878,7 +1030,8 @@ class CogVideoXFunResizeToClosestBucket:
        resized_images = resized_images.movedim(1,-1)

        return (resized_images, width, height)

#region FunSamplers
class CogVideoXFunSampler:
    @classmethod
    def INPUT_TYPES(s):
@ -888,7 +1041,8 @@ class CogVideoXFunSampler:
"positive": ("CONDITIONING", ),
"negative": ("CONDITIONING", ),
"video_length": ("INT", {"default": 49, "min": 5, "max": 2048, "step": 4}),
"base_resolution": ("INT", {"min": 64, "max": 1280, "step": 64, "default": 512, "tooltip": "Base resolution, closest training data bucket resolution is chosen based on the selection."}),
"width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
"seed": ("INT", {"default": 43, "min": 0, "max": 0xffffffffffffffff}),
"steps": ("INT", {"default": 50, "min": 1, "max": 200, "step": 1}),
"cfg": ("FLOAT", {"default": 6.0, "min": 1.0, "max": 20.0, "step": 0.01}),
@ -897,7 +1051,6 @@ class CogVideoXFunSampler:
"optional":{
"start_img": ("IMAGE",),
"end_img": ("IMAGE",),
"opt_empty_latent": ("LATENT",),
"noise_aug_strength": ("FLOAT", {"default": 0.0563, "min": 0.0, "max": 1.0, "step": 0.001}),
"context_options": ("COGCONTEXT", ),
"tora_trajectory": ("TORAFEATURES", ),
@ -912,8 +1065,8 @@ class CogVideoXFunSampler:
FUNCTION = "process"
CATEGORY = "CogVideoWrapper"
def process(self, pipeline, positive, negative, video_length, base_resolution, seed, steps, cfg, scheduler,
start_img=None, end_img=None, opt_empty_latent=None, noise_aug_strength=0.0563, context_options=None, fastercache=None,
def process(self, pipeline, positive, negative, video_length, width, height, seed, steps, cfg, scheduler,
start_img=None, end_img=None, noise_aug_strength=0.0563, context_options=None, fastercache=None,
tora_trajectory=None, vid2vid_images=None, vid2vid_denoise=1.0):
device = mm.get_torch_device()
offload_device = mm.unet_offload_device()
@ -929,23 +1082,13 @@ class CogVideoXFunSampler:
        mm.soft_empty_cache()

        aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}

        #vid2vid
        if vid2vid_images is not None:
            validation_video = np.array(vid2vid_images.cpu().numpy() * 255, np.uint8)
            original_width, original_height = Image.fromarray(validation_video[0]).size
        #img2vid
        elif start_img is not None:
            start_img = [to_pil(_start_img) for _start_img in start_img] if start_img is not None else None
            end_img = [to_pil(_end_img) for _end_img in end_img] if end_img is not None else None
            # Count most suitable height and width
            original_width, original_height = start_img[0].size if type(start_img) is list else Image.open(start_img).size
        else:
            original_width = opt_empty_latent["samples"][0].shape[-1] * 8
            original_height = opt_empty_latent["samples"][0].shape[-2] * 8
        closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
        height, width = [int(x / 16) * 16 for x in closest_size]
        log.info(f"Closest bucket size: {width}x{height}")

        end_img = [to_pil(_end_img) for _end_img in end_img] if end_img is not None else None

        # Load Sampler
        if context_options is not None and context_options["context_schedule"] == "temporal_tiling":
@ -1045,156 +1188,6 @@ class CogVideoXFunVid2VidSampler:
    DEPRECATED = True

    def process(self):
        return ()
def add_noise_to_reference_video(image, ratio=None):
    if ratio is None:
        sigma = torch.normal(mean=-3.0, std=0.5, size=(image.shape[0],)).to(image.device)
        sigma = torch.exp(sigma).to(image.dtype)
    else:
        sigma = torch.ones((image.shape[0],)).to(image.device, image.dtype) * ratio

    image_noise = torch.randn_like(image) * sigma[:, None, None, None, None]
    image_noise = torch.where(image == -1, torch.zeros_like(image), image_noise)
    image = image + image_noise
    return image

class CogVideoControlImageEncode:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "pipeline": ("COGVIDEOPIPE",),
            "control_video": ("IMAGE", ),
            "base_resolution": ("INT", {"min": 64, "max": 1280, "step": 64, "default": 512, "tooltip": "Base resolution, closest training data bucket resolution is chosen based on the selection."}),
            "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
            "noise_aug_strength": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}),
            },
        }

    RETURN_TYPES = ("COGCONTROL_LATENTS", "INT", "INT",)
    RETURN_NAMES = ("control_latents", "width", "height")
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    def encode(self, pipeline, control_video, base_resolution, enable_tiling, noise_aug_strength=0.0563):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()

        B, H, W, C = control_video.shape

        vae = pipeline["pipe"].vae
        vae.enable_slicing()
        if enable_tiling:
            from .mz_enable_vae_encode_tiling import enable_vae_encode_tiling
            enable_vae_encode_tiling(vae)

        if not pipeline["cpu_offloading"]:
            vae.to(device)

        # Count most suitable height and width
        aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
        control_video = np.array(control_video.cpu().numpy() * 255, np.uint8)
        original_width, original_height = Image.fromarray(control_video[0]).size
        closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
        height, width = [int(x / 16) * 16 for x in closest_size]
        log.info(f"Closest bucket size: {width}x{height}")

        video_length = int((B - 1) // vae.config.temporal_compression_ratio * vae.config.temporal_compression_ratio) + 1 if B != 1 else 1
        input_video, input_video_mask, clip_image = get_video_to_video_latent(control_video, video_length=video_length, sample_size=(height, width))

        control_video = pipeline["pipe"].image_processor.preprocess(rearrange(input_video, "b c f h w -> (b f) c h w"), height=height, width=width)
        control_video = control_video.to(dtype=torch.float32)
        control_video = rearrange(control_video, "(b f) c h w -> b c f h w", f=video_length)

        masked_image = control_video.to(device=device, dtype=vae.dtype)
        if noise_aug_strength > 0:
            masked_image = add_noise_to_reference_video(masked_image, ratio=noise_aug_strength)

        bs = 1
        new_mask_pixel_values = []
        for i in range(0, masked_image.shape[0], bs):
            mask_pixel_values_bs = masked_image[i : i + bs]
            mask_pixel_values_bs = vae.encode(mask_pixel_values_bs)[0]
            mask_pixel_values_bs = mask_pixel_values_bs.mode()
            new_mask_pixel_values.append(mask_pixel_values_bs)
        masked_image_latents = torch.cat(new_mask_pixel_values, dim = 0)

        masked_image_latents = masked_image_latents * vae.config.scaling_factor
        vae.to(offload_device)

        control_latents = {
            "latents": masked_image_latents,
            "num_frames" : B,
            "height" : height,
            "width" : width,
        }
        return (control_latents, width, height)

class CogVideoControlNet:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "controlnet": ("COGVIDECONTROLNETMODEL",),
            "images": ("IMAGE", ),
            "control_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
            "control_start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            "control_end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            },
        }

    RETURN_TYPES = ("COGVIDECONTROLNET",)
    RETURN_NAMES = ("cogvideo_controlnet",)
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    def encode(self, controlnet, images, control_strength, control_start_percent, control_end_percent):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()

        B, H, W, C = images.shape

        control_frames = images.permute(0, 3, 1, 2).unsqueeze(0) * 2 - 1

        controlnet = {
            "control_model": controlnet,
            "control_frames": control_frames,
            "control_weights": control_strength,
            "control_start": control_start_percent,
            "control_end": control_end_percent,
        }
        return (controlnet,)

class CogVideoContextOptions:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "context_schedule": (["uniform_standard", "uniform_looped", "static_standard", "temporal_tiling"],),
            "context_frames": ("INT", {"default": 48, "min": 2, "max": 100, "step": 1, "tooltip": "Number of pixel frames in the context, NOTE: the latent space has 4 frames in 1"} ),
            "context_stride": ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context stride as pixel frames, NOTE: the latent space has 4 frames in 1"} ),
            "context_overlap": ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context overlap as pixel frames, NOTE: the latent space has 4 frames in 1"} ),
            "freenoise": ("BOOLEAN", {"default": True, "tooltip": "Shuffle the noise"}),
            }
        }

    RETURN_TYPES = ("COGCONTEXT", )
    RETURN_NAMES = ("context_options",)
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"

    def process(self, context_schedule, context_frames, context_stride, context_overlap, freenoise):
        context_options = {
            "context_schedule": context_schedule,
            "context_frames": context_frames,
            "context_stride": context_stride,
            "context_overlap": context_overlap,
            "freenoise": freenoise
        }
        return (context_options,)

class CogVideoXFunControlSampler:
    @classmethod