mirror of
https://git.datalinker.icu/kijai/ComfyUI-CogVideoXWrapper.git
synced 2026-01-23 10:24:24 +08:00
Refactor Fun sampler to be easier to use with Tora (breaks old workflows!)
The FunSampler node in old workflows needs to be remade. I moved the forced bucket resize to its own node if anyone still wants to use that.
This commit is contained in:
parent
666f7832f9
commit
9202921920
1315
examples/cogvideox_fun_img2vid_tora_01.json
Normal file
1315
examples/cogvideox_fun_img2vid_tora_01.json
Normal file
File diff suppressed because one or more lines are too long
@ -1,6 +1,6 @@
|
||||
{
|
||||
"last_node_id": 48,
|
||||
"last_link_id": 101,
|
||||
"last_node_id": 51,
|
||||
"last_link_id": 114,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 20,
|
||||
@ -22,8 +22,7 @@
|
||||
"name": "CLIP",
|
||||
"type": "CLIP",
|
||||
"links": [
|
||||
54,
|
||||
56
|
||||
54
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
@ -46,16 +45,16 @@
|
||||
},
|
||||
"size": {
|
||||
"0": 463.01251220703125,
|
||||
"1": 124
|
||||
"1": 144
|
||||
},
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "clip",
|
||||
"type": "CLIP",
|
||||
"link": 56
|
||||
"link": 108
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
@ -63,10 +62,15 @@
|
||||
"name": "conditioning",
|
||||
"type": "CONDITIONING",
|
||||
"links": [
|
||||
86
|
||||
111
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "clip",
|
||||
"type": "CLIP",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
@ -87,7 +91,7 @@
|
||||
},
|
||||
"size": [
|
||||
855.81494140625,
|
||||
927.6441243489584
|
||||
881.2099609375
|
||||
],
|
||||
"flags": {},
|
||||
"order": 8,
|
||||
@ -101,17 +105,20 @@
|
||||
{
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"link": null
|
||||
"link": null,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "meta_batch",
|
||||
"type": "VHS_BatchManager",
|
||||
"link": null
|
||||
"link": null,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "vae",
|
||||
"type": "VAE",
|
||||
"link": null
|
||||
"link": null,
|
||||
"shape": 7
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
@ -139,7 +146,7 @@
|
||||
"hidden": false,
|
||||
"paused": false,
|
||||
"params": {
|
||||
"filename": "CogVideoX_Fun_00012.mp4",
|
||||
"filename": "CogVideoX_Fun_00003.mp4",
|
||||
"subfolder": "",
|
||||
"type": "temp",
|
||||
"format": "video/h264-mp4",
|
||||
@ -149,61 +156,12 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "CogVideoDecode",
|
||||
"pos": {
|
||||
"0": 1448,
|
||||
"1": 345
|
||||
},
|
||||
"size": {
|
||||
"0": 300.396484375,
|
||||
"1": 198
|
||||
},
|
||||
"flags": {},
|
||||
"order": 7,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "pipeline",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"link": 89
|
||||
},
|
||||
{
|
||||
"name": "samples",
|
||||
"type": "LATENT",
|
||||
"link": 88
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
97
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CogVideoDecode"
|
||||
},
|
||||
"widgets_values": [
|
||||
true,
|
||||
240,
|
||||
360,
|
||||
0.2,
|
||||
0.2,
|
||||
true
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 36,
|
||||
"type": "LoadImage",
|
||||
"pos": {
|
||||
"0": 364,
|
||||
"1": 715
|
||||
"0": 227,
|
||||
"1": 700
|
||||
},
|
||||
"size": {
|
||||
"0": 391.3421325683594,
|
||||
@ -242,15 +200,15 @@
|
||||
"id": 37,
|
||||
"type": "ImageResizeKJ",
|
||||
"pos": {
|
||||
"0": 824,
|
||||
"1": 715
|
||||
"0": 688,
|
||||
"1": 708
|
||||
},
|
||||
"size": {
|
||||
"0": 315,
|
||||
"1": 266
|
||||
},
|
||||
"flags": {},
|
||||
"order": 5,
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -261,7 +219,8 @@
|
||||
{
|
||||
"name": "get_image_size",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
"link": null,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "width_input",
|
||||
@ -285,7 +244,7 @@
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
87
|
||||
112
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
@ -317,6 +276,55 @@
|
||||
"disabled"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "CogVideoDecode",
|
||||
"pos": {
|
||||
"0": 1477,
|
||||
"1": 344
|
||||
},
|
||||
"size": {
|
||||
"0": 300.396484375,
|
||||
"1": 198
|
||||
},
|
||||
"flags": {},
|
||||
"order": 7,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "pipeline",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"link": 113
|
||||
},
|
||||
{
|
||||
"name": "samples",
|
||||
"type": "LATENT",
|
||||
"link": 114
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
97
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CogVideoDecode"
|
||||
},
|
||||
"widgets_values": [
|
||||
true,
|
||||
240,
|
||||
360,
|
||||
0.2,
|
||||
0.2,
|
||||
true
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"type": "CogVideoTextEncode",
|
||||
@ -343,10 +351,18 @@
|
||||
"name": "conditioning",
|
||||
"type": "CONDITIONING",
|
||||
"links": [
|
||||
85
|
||||
110
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "clip",
|
||||
"type": "CLIP",
|
||||
"links": [
|
||||
108
|
||||
],
|
||||
"slot_index": 1
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
@ -355,55 +371,19 @@
|
||||
"widgets_values": [
|
||||
"majestic stag grazing in a forest and basking in the setting sun",
|
||||
1,
|
||||
true
|
||||
false
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 48,
|
||||
"type": "DownloadAndLoadCogVideoGGUFModel",
|
||||
"pos": {
|
||||
"0": 584,
|
||||
"1": 103
|
||||
},
|
||||
"size": {
|
||||
"0": 378,
|
||||
"1": 130
|
||||
},
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "cogvideo_pipe",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"links": [
|
||||
101
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 0
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "DownloadAndLoadCogVideoGGUFModel"
|
||||
},
|
||||
"widgets_values": [
|
||||
"CogVideoX_5b_fun_GGUF_Q4_0.safetensors",
|
||||
"bf16",
|
||||
false,
|
||||
"offload_device"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 41,
|
||||
"id": 51,
|
||||
"type": "CogVideoXFunSampler",
|
||||
"pos": {
|
||||
"0": 1058,
|
||||
"1": 345
|
||||
},
|
||||
"size": {
|
||||
"0": 315,
|
||||
"1": 302
|
||||
"0": 367.79998779296875,
|
||||
"1": 434
|
||||
},
|
||||
"flags": {},
|
||||
"order": 6,
|
||||
@ -412,32 +392,53 @@
|
||||
{
|
||||
"name": "pipeline",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"link": 101
|
||||
"link": 109
|
||||
},
|
||||
{
|
||||
"name": "positive",
|
||||
"type": "CONDITIONING",
|
||||
"link": 85
|
||||
"link": 110
|
||||
},
|
||||
{
|
||||
"name": "negative",
|
||||
"type": "CONDITIONING",
|
||||
"link": 86
|
||||
"link": 111
|
||||
},
|
||||
{
|
||||
"name": "start_img",
|
||||
"type": "IMAGE",
|
||||
"link": 87
|
||||
"link": 112,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "end_img",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
"link": null,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "opt_empty_latent",
|
||||
"type": "LATENT",
|
||||
"link": null
|
||||
"name": "context_options",
|
||||
"type": "COGCONTEXT",
|
||||
"link": null,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "tora_trajectory",
|
||||
"type": "TORAFEATURES",
|
||||
"link": null,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "fastercache",
|
||||
"type": "FASTERCACHEARGS",
|
||||
"link": null,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "vid2vid_images",
|
||||
"type": "IMAGE",
|
||||
"link": null,
|
||||
"shape": 7
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
@ -445,18 +446,15 @@
|
||||
"name": "cogvideo_pipe",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"links": [
|
||||
89
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
113
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "samples",
|
||||
"type": "LATENT",
|
||||
"links": [
|
||||
88
|
||||
],
|
||||
"shape": 3
|
||||
114
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
@ -464,12 +462,66 @@
|
||||
},
|
||||
"widgets_values": [
|
||||
49,
|
||||
512,
|
||||
44,
|
||||
"fixed",
|
||||
30,
|
||||
720,
|
||||
480,
|
||||
43,
|
||||
"randomize",
|
||||
50,
|
||||
6,
|
||||
"CogVideoXDPMScheduler"
|
||||
"DDIM",
|
||||
0.0563,
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 48,
|
||||
"type": "DownloadAndLoadCogVideoGGUFModel",
|
||||
"pos": {
|
||||
"0": 585,
|
||||
"1": 34
|
||||
},
|
||||
"size": {
|
||||
"0": 378,
|
||||
"1": 198
|
||||
},
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "pab_config",
|
||||
"type": "PAB_CONFIG",
|
||||
"link": null,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "block_edit",
|
||||
"type": "TRANSFORMERBLOCKS",
|
||||
"link": null,
|
||||
"shape": 7
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "cogvideo_pipe",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"links": [
|
||||
109
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "DownloadAndLoadCogVideoGGUFModel"
|
||||
},
|
||||
"widgets_values": [
|
||||
"CogVideoX_5b_fun_1_1_GGUF_Q4_0.safetensors",
|
||||
"bf16",
|
||||
false,
|
||||
"offload_device",
|
||||
false,
|
||||
"disabled"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -482,14 +534,6 @@
|
||||
0,
|
||||
"CLIP"
|
||||
],
|
||||
[
|
||||
56,
|
||||
20,
|
||||
0,
|
||||
31,
|
||||
0,
|
||||
"CLIP"
|
||||
],
|
||||
[
|
||||
71,
|
||||
36,
|
||||
@ -498,46 +542,6 @@
|
||||
0,
|
||||
"IMAGE"
|
||||
],
|
||||
[
|
||||
85,
|
||||
30,
|
||||
0,
|
||||
41,
|
||||
1,
|
||||
"CONDITIONING"
|
||||
],
|
||||
[
|
||||
86,
|
||||
31,
|
||||
0,
|
||||
41,
|
||||
2,
|
||||
"CONDITIONING"
|
||||
],
|
||||
[
|
||||
87,
|
||||
37,
|
||||
0,
|
||||
41,
|
||||
3,
|
||||
"IMAGE"
|
||||
],
|
||||
[
|
||||
88,
|
||||
41,
|
||||
1,
|
||||
11,
|
||||
1,
|
||||
"LATENT"
|
||||
],
|
||||
[
|
||||
89,
|
||||
41,
|
||||
0,
|
||||
11,
|
||||
0,
|
||||
"COGVIDEOPIPE"
|
||||
],
|
||||
[
|
||||
97,
|
||||
11,
|
||||
@ -547,22 +551,70 @@
|
||||
"IMAGE"
|
||||
],
|
||||
[
|
||||
101,
|
||||
108,
|
||||
30,
|
||||
1,
|
||||
31,
|
||||
0,
|
||||
"CLIP"
|
||||
],
|
||||
[
|
||||
109,
|
||||
48,
|
||||
0,
|
||||
41,
|
||||
51,
|
||||
0,
|
||||
"COGVIDEOPIPE"
|
||||
],
|
||||
[
|
||||
110,
|
||||
30,
|
||||
0,
|
||||
51,
|
||||
1,
|
||||
"CONDITIONING"
|
||||
],
|
||||
[
|
||||
111,
|
||||
31,
|
||||
0,
|
||||
51,
|
||||
2,
|
||||
"CONDITIONING"
|
||||
],
|
||||
[
|
||||
112,
|
||||
37,
|
||||
0,
|
||||
51,
|
||||
3,
|
||||
"IMAGE"
|
||||
],
|
||||
[
|
||||
113,
|
||||
51,
|
||||
0,
|
||||
11,
|
||||
0,
|
||||
"COGVIDEOPIPE"
|
||||
],
|
||||
[
|
||||
114,
|
||||
51,
|
||||
1,
|
||||
11,
|
||||
1,
|
||||
"LATENT"
|
||||
]
|
||||
],
|
||||
"groups": [],
|
||||
"config": {},
|
||||
"extra": {
|
||||
"ds": {
|
||||
"scale": 0.7627768444385654,
|
||||
"scale": 0.7513148009015784,
|
||||
"offset": [
|
||||
62.58315607223924,
|
||||
102.05205752424705
|
||||
724.7448506313632,
|
||||
128.336592104936
|
||||
]
|
||||
}
|
||||
},
|
||||
@ -1,6 +1,6 @@
|
||||
{
|
||||
"last_node_id": 45,
|
||||
"last_link_id": 97,
|
||||
"last_node_id": 47,
|
||||
"last_link_id": 110,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 20,
|
||||
@ -22,8 +22,7 @@
|
||||
"name": "CLIP",
|
||||
"type": "CLIP",
|
||||
"links": [
|
||||
54,
|
||||
56
|
||||
54
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
@ -37,85 +36,6 @@
|
||||
"sd3"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 37,
|
||||
"type": "ImageResizeKJ",
|
||||
"pos": {
|
||||
"0": 824,
|
||||
"1": 715
|
||||
},
|
||||
"size": {
|
||||
"0": 315,
|
||||
"1": 266
|
||||
},
|
||||
"flags": {},
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 71
|
||||
},
|
||||
{
|
||||
"name": "get_image_size",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "width_input",
|
||||
"type": "INT",
|
||||
"link": null,
|
||||
"widget": {
|
||||
"name": "width_input"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "height_input",
|
||||
"type": "INT",
|
||||
"link": null,
|
||||
"widget": {
|
||||
"name": "height_input"
|
||||
}
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
87
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "width",
|
||||
"type": "INT",
|
||||
"links": null,
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "height",
|
||||
"type": "INT",
|
||||
"links": null,
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ImageResizeKJ"
|
||||
},
|
||||
"widgets_values": [
|
||||
720,
|
||||
480,
|
||||
"nearest-exact",
|
||||
false,
|
||||
2,
|
||||
0,
|
||||
0,
|
||||
"disabled"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "CogVideoDecode",
|
||||
@ -134,12 +54,12 @@
|
||||
{
|
||||
"name": "pipeline",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"link": 89
|
||||
"link": 108
|
||||
},
|
||||
{
|
||||
"name": "samples",
|
||||
"type": "LATENT",
|
||||
"link": 88
|
||||
"link": 109
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
@ -165,43 +85,6 @@
|
||||
true
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"type": "DownloadAndLoadCogVideoModel",
|
||||
"pos": {
|
||||
"0": 642,
|
||||
"1": 90
|
||||
},
|
||||
"size": {
|
||||
"0": 337.8885192871094,
|
||||
"1": 154
|
||||
},
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "cogvideo_pipe",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"links": [
|
||||
84
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "DownloadAndLoadCogVideoModel"
|
||||
},
|
||||
"widgets_values": [
|
||||
"kijai/CogVideoX-Fun-5b",
|
||||
"bf16",
|
||||
"disabled",
|
||||
"disabled",
|
||||
false
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 31,
|
||||
"type": "CogVideoTextEncode",
|
||||
@ -211,16 +94,16 @@
|
||||
},
|
||||
"size": {
|
||||
"0": 463.01251220703125,
|
||||
"1": 98.10446166992188
|
||||
"1": 144
|
||||
},
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "clip",
|
||||
"type": "CLIP",
|
||||
"link": 56
|
||||
"link": 110
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
@ -228,17 +111,24 @@
|
||||
"name": "conditioning",
|
||||
"type": "CONDITIONING",
|
||||
"links": [
|
||||
86
|
||||
106
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "clip",
|
||||
"type": "CLIP",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CogVideoTextEncode"
|
||||
},
|
||||
"widgets_values": [
|
||||
"The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. "
|
||||
"The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. ",
|
||||
1,
|
||||
true
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -249,8 +139,8 @@
|
||||
"1": 345
|
||||
},
|
||||
"size": [
|
||||
605.3909898931465,
|
||||
724.5306772953109
|
||||
605.3909912109375,
|
||||
714.2606608072917
|
||||
],
|
||||
"flags": {},
|
||||
"order": 8,
|
||||
@ -264,17 +154,20 @@
|
||||
{
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"link": null
|
||||
"link": null,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "meta_batch",
|
||||
"type": "VHS_BatchManager",
|
||||
"link": null
|
||||
"link": null,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "vae",
|
||||
"type": "VAE",
|
||||
"link": null
|
||||
"link": null,
|
||||
"shape": 7
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
@ -302,7 +195,7 @@
|
||||
"hidden": false,
|
||||
"paused": false,
|
||||
"params": {
|
||||
"filename": "CogVideoX_Fun_00003.mp4",
|
||||
"filename": "CogVideoX_Fun_00001.mp4",
|
||||
"subfolder": "",
|
||||
"type": "temp",
|
||||
"format": "video/h264-mp4",
|
||||
@ -313,15 +206,191 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 41,
|
||||
"type": "CogVideoXFunSampler",
|
||||
"id": 36,
|
||||
"type": "LoadImage",
|
||||
"pos": {
|
||||
"0": 1058,
|
||||
"1": 345
|
||||
"0": 325,
|
||||
"1": 715
|
||||
},
|
||||
"size": {
|
||||
"0": 432.4361877441406,
|
||||
"1": 361.0254211425781
|
||||
},
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
71
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "MASK",
|
||||
"type": "MASK",
|
||||
"links": null,
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "LoadImage"
|
||||
},
|
||||
"widgets_values": [
|
||||
"6e1a7befce6daa63fc01cb66c1a22ed0.jpg",
|
||||
"image"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"type": "DownloadAndLoadCogVideoModel",
|
||||
"pos": {
|
||||
"0": 602,
|
||||
"1": 53
|
||||
},
|
||||
"size": {
|
||||
"0": 337.8885192871094,
|
||||
"1": 194
|
||||
},
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "pab_config",
|
||||
"type": "PAB_CONFIG",
|
||||
"link": null,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "block_edit",
|
||||
"type": "TRANSFORMERBLOCKS",
|
||||
"link": null,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "lora",
|
||||
"type": "COGLORA",
|
||||
"link": null,
|
||||
"shape": 7
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "cogvideo_pipe",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"links": [
|
||||
104
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "DownloadAndLoadCogVideoModel"
|
||||
},
|
||||
"widgets_values": [
|
||||
"kijai/CogVideoX-Fun-5b",
|
||||
"bf16",
|
||||
"disabled",
|
||||
"disabled",
|
||||
false
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 37,
|
||||
"type": "ImageResizeKJ",
|
||||
"pos": {
|
||||
"0": 824,
|
||||
"1": 715
|
||||
},
|
||||
"size": {
|
||||
"0": 315,
|
||||
"1": 282
|
||||
"1": 266
|
||||
},
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 71
|
||||
},
|
||||
{
|
||||
"name": "get_image_size",
|
||||
"type": "IMAGE",
|
||||
"link": null,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "width_input",
|
||||
"type": "INT",
|
||||
"link": null,
|
||||
"widget": {
|
||||
"name": "width_input"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "height_input",
|
||||
"type": "INT",
|
||||
"link": null,
|
||||
"widget": {
|
||||
"name": "height_input"
|
||||
}
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
107
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "width",
|
||||
"type": "INT",
|
||||
"links": null,
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "height",
|
||||
"type": "INT",
|
||||
"links": null,
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ImageResizeKJ"
|
||||
},
|
||||
"widgets_values": [
|
||||
720,
|
||||
480,
|
||||
"lanczos",
|
||||
false,
|
||||
2,
|
||||
0,
|
||||
0,
|
||||
"disabled"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 47,
|
||||
"type": "CogVideoXFunSampler",
|
||||
"pos": {
|
||||
"0": 1068,
|
||||
"1": 198
|
||||
},
|
||||
"size": {
|
||||
"0": 367.79998779296875,
|
||||
"1": 434
|
||||
},
|
||||
"flags": {},
|
||||
"order": 6,
|
||||
@ -330,27 +399,53 @@
|
||||
{
|
||||
"name": "pipeline",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"link": 84
|
||||
"link": 104
|
||||
},
|
||||
{
|
||||
"name": "positive",
|
||||
"type": "CONDITIONING",
|
||||
"link": 85
|
||||
"link": 105
|
||||
},
|
||||
{
|
||||
"name": "negative",
|
||||
"type": "CONDITIONING",
|
||||
"link": 86
|
||||
"link": 106
|
||||
},
|
||||
{
|
||||
"name": "start_img",
|
||||
"type": "IMAGE",
|
||||
"link": 87
|
||||
"link": 107,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "end_img",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
"link": null,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "context_options",
|
||||
"type": "COGCONTEXT",
|
||||
"link": null,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "tora_trajectory",
|
||||
"type": "TORAFEATURES",
|
||||
"link": null,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "fastercache",
|
||||
"type": "FASTERCACHEARGS",
|
||||
"link": null,
|
||||
"shape": 7
|
||||
},
|
||||
{
|
||||
"name": "vid2vid_images",
|
||||
"type": "IMAGE",
|
||||
"link": null,
|
||||
"shape": 7
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
@ -358,18 +453,15 @@
|
||||
"name": "cogvideo_pipe",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"links": [
|
||||
89
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
108
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "samples",
|
||||
"type": "LATENT",
|
||||
"links": [
|
||||
88
|
||||
],
|
||||
"shape": 3
|
||||
109
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
@ -377,12 +469,15 @@
|
||||
},
|
||||
"widgets_values": [
|
||||
49,
|
||||
512,
|
||||
720,
|
||||
480,
|
||||
43,
|
||||
"fixed",
|
||||
30,
|
||||
50,
|
||||
6,
|
||||
"DPM++"
|
||||
"DDIM",
|
||||
0.0563,
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -411,57 +506,27 @@
|
||||
"name": "conditioning",
|
||||
"type": "CONDITIONING",
|
||||
"links": [
|
||||
85
|
||||
105
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "clip",
|
||||
"type": "CLIP",
|
||||
"links": [
|
||||
110
|
||||
],
|
||||
"slot_index": 1
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CogVideoTextEncode"
|
||||
},
|
||||
"widgets_values": [
|
||||
"fireworks display over night city. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic."
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 36,
|
||||
"type": "LoadImage",
|
||||
"pos": {
|
||||
"0": 325,
|
||||
"1": 715
|
||||
},
|
||||
"size": {
|
||||
"0": 432.4361877441406,
|
||||
"1": 361.0254211425781
|
||||
},
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
71
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "MASK",
|
||||
"type": "MASK",
|
||||
"links": null,
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "LoadImage"
|
||||
},
|
||||
"widgets_values": [
|
||||
"6e1a7befce6daa63fc01cb66c1a22ed0.jpg",
|
||||
"image"
|
||||
"fireworks display over night city. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
|
||||
1,
|
||||
false
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -474,14 +539,6 @@
|
||||
0,
|
||||
"CLIP"
|
||||
],
|
||||
[
|
||||
56,
|
||||
20,
|
||||
0,
|
||||
31,
|
||||
0,
|
||||
"CLIP"
|
||||
],
|
||||
[
|
||||
71,
|
||||
36,
|
||||
@ -490,54 +547,6 @@
|
||||
0,
|
||||
"IMAGE"
|
||||
],
|
||||
[
|
||||
84,
|
||||
1,
|
||||
0,
|
||||
41,
|
||||
0,
|
||||
"COGVIDEOPIPE"
|
||||
],
|
||||
[
|
||||
85,
|
||||
30,
|
||||
0,
|
||||
41,
|
||||
1,
|
||||
"CONDITIONING"
|
||||
],
|
||||
[
|
||||
86,
|
||||
31,
|
||||
0,
|
||||
41,
|
||||
2,
|
||||
"CONDITIONING"
|
||||
],
|
||||
[
|
||||
87,
|
||||
37,
|
||||
0,
|
||||
41,
|
||||
3,
|
||||
"IMAGE"
|
||||
],
|
||||
[
|
||||
88,
|
||||
41,
|
||||
1,
|
||||
11,
|
||||
1,
|
||||
"LATENT"
|
||||
],
|
||||
[
|
||||
89,
|
||||
41,
|
||||
0,
|
||||
11,
|
||||
0,
|
||||
"COGVIDEOPIPE"
|
||||
],
|
||||
[
|
||||
97,
|
||||
11,
|
||||
@ -545,16 +554,72 @@
|
||||
44,
|
||||
0,
|
||||
"IMAGE"
|
||||
],
|
||||
[
|
||||
104,
|
||||
1,
|
||||
0,
|
||||
47,
|
||||
0,
|
||||
"COGVIDEOPIPE"
|
||||
],
|
||||
[
|
||||
105,
|
||||
30,
|
||||
0,
|
||||
47,
|
||||
1,
|
||||
"CONDITIONING"
|
||||
],
|
||||
[
|
||||
106,
|
||||
31,
|
||||
0,
|
||||
47,
|
||||
2,
|
||||
"CONDITIONING"
|
||||
],
|
||||
[
|
||||
107,
|
||||
37,
|
||||
0,
|
||||
47,
|
||||
3,
|
||||
"IMAGE"
|
||||
],
|
||||
[
|
||||
108,
|
||||
47,
|
||||
0,
|
||||
11,
|
||||
0,
|
||||
"COGVIDEOPIPE"
|
||||
],
|
||||
[
|
||||
109,
|
||||
47,
|
||||
1,
|
||||
11,
|
||||
1,
|
||||
"LATENT"
|
||||
],
|
||||
[
|
||||
110,
|
||||
30,
|
||||
1,
|
||||
31,
|
||||
0,
|
||||
"CLIP"
|
||||
]
|
||||
],
|
||||
"groups": [],
|
||||
"config": {},
|
||||
"extra": {
|
||||
"ds": {
|
||||
"scale": 0.8264462809917361,
|
||||
"scale": 0.8264462809917363,
|
||||
"offset": [
|
||||
97.64239267521098,
|
||||
39.894747674006986
|
||||
245.90746806300405,
|
||||
108.93624646284617
|
||||
]
|
||||
}
|
||||
},
|
||||
341
nodes.py
341
nodes.py
@ -101,7 +101,33 @@ class CogVideoPABConfig:
|
||||
|
||||
return (pab_config, )
|
||||
|
||||
class CogVideoContextOptions:
    """Node that bundles context-window settings into a single ``COGCONTEXT`` dict.

    The node performs no computation: ``process`` simply returns the widget
    values it was given, keyed by their input names.
    """

    @classmethod
    def INPUT_TYPES(s):
        """Describe the five required widgets exposed by this node."""
        schedules = ["uniform_standard", "uniform_looped", "static_standard", "temporal_tiling"]
        required = {
            "context_schedule": (schedules,),
            "context_frames": ("INT", {"default": 48, "min": 2, "max": 100, "step": 1, "tooltip": "Number of pixel frames in the context, NOTE: the latent space has 4 frames in 1"}),
            "context_stride": ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context stride as pixel frames, NOTE: the latent space has 4 frames in 1"}),
            "context_overlap": ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context overlap as pixel frames, NOTE: the latent space has 4 frames in 1"}),
            "freenoise": ("BOOLEAN", {"default": True, "tooltip": "Shuffle the noise"}),
        }
        return {"required": required}

    RETURN_TYPES = ("COGCONTEXT", )
    RETURN_NAMES = ("context_options",)
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"

    def process(self, context_schedule, context_frames, context_stride, context_overlap, freenoise):
        """Return a one-element tuple holding the options dict.

        The dict keys are ``context_schedule``, ``context_frames``,
        ``context_stride``, ``context_overlap`` and ``freenoise``; each value
        is the corresponding argument, passed through unchanged.
        """
        packed = dict(
            context_schedule=context_schedule,
            context_frames=context_frames,
            context_stride=context_stride,
            context_overlap=context_overlap,
            freenoise=freenoise,
        )
        return (packed,)
|
||||
|
||||
class CogVideoTransformerEdit:
|
||||
@classmethod
|
||||
@ -155,7 +181,8 @@ class CogVideoLoraSelect:
|
||||
cog_loras_list.append(cog_lora)
|
||||
print(cog_loras_list)
|
||||
return (cog_loras_list,)
|
||||
|
||||
|
||||
#region TextEncode
|
||||
class CogVideoEncodePrompt:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -257,8 +284,8 @@ class CogVideoTextEncode:
|
||||
}
|
||||
}
|
||||
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
RETURN_NAMES = ("conditioning",)
|
||||
RETURN_TYPES = ("CONDITIONING", "CLIP",)
|
||||
RETURN_NAMES = ("conditioning", "clip")
|
||||
FUNCTION = "process"
|
||||
CATEGORY = "CogVideoWrapper"
|
||||
|
||||
@ -279,7 +306,7 @@ class CogVideoTextEncode:
|
||||
if force_offload:
|
||||
clip.cond_stage_model.to(offload_device)
|
||||
|
||||
return (embeds, )
|
||||
return (embeds, clip, )
|
||||
|
||||
class CogVideoTextEncodeCombine:
|
||||
@classmethod
|
||||
@ -311,7 +338,8 @@ class CogVideoTextEncodeCombine:
|
||||
raise ValueError("Invalid combination mode")
|
||||
|
||||
return (embeds, )
|
||||
|
||||
|
||||
#region ImageEncode
|
||||
class CogVideoImageEncode:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -473,7 +501,8 @@ class CogVideoImageInterpolationEncode:
|
||||
vae.to(offload_device)
|
||||
|
||||
return ({"samples": final_latents}, )
|
||||
|
||||
|
||||
#region Tora
|
||||
from .tora.traj_utils import process_traj, scale_traj_list_to_256
|
||||
from torchvision.utils import flow_to_image
|
||||
|
||||
@ -630,8 +659,94 @@ class ToraEncodeOpticalFlow:
|
||||
}
|
||||
|
||||
return (tora, )
|
||||
|
||||
|
||||
def add_noise_to_reference_video(image, ratio=None):
    """Return ``image`` plus Gaussian noise scaled per batch element.

    Args:
        image: tensor whose first dimension is the batch. The noise scale is
            broadcast as ``sigma[:, None, None, None, None]``, so a 5-D
            tensor is presumably expected (TODO confirm against callers).
        ratio: fixed noise scale applied to every batch element. When
            ``None``, one scale per batch element is sampled as
            ``exp(N(mean=-3.0, std=0.5))``.

    Returns:
        A new tensor ``image + noise``. Positions where ``image == -1``
        receive zero noise and are therefore returned unchanged.
    """
    batch = image.shape[0]

    if ratio is not None:
        # Same fixed scale for every batch element.
        sigma = torch.ones((batch,)).to(image.device, image.dtype) * ratio
    else:
        # Sample the log-scale first, then exponentiate.
        log_sigma = torch.normal(mean=-3.0, std=0.5, size=(batch,)).to(image.device)
        sigma = torch.exp(log_sigma).to(image.dtype)

    noise = torch.randn_like(image) * sigma[:, None, None, None, None]
    # Suppress noise wherever the input holds exactly -1.
    noise = torch.where(image == -1, torch.zeros_like(image), noise)

    return image + noise
|
||||
|
||||
class CogVideoControlImageEncode:
    """Node that encodes a control video into VAE latents at a bucket resolution."""

    @classmethod
    def INPUT_TYPES(s):
        """Describe the required inputs and widgets for this node."""
        required = {
            "pipeline": ("COGVIDEOPIPE",),
            "control_video": ("IMAGE", ),
            "base_resolution": ("INT", {"min": 64, "max": 1280, "step": 64, "default": 512, "tooltip": "Base resolution, closest training data bucket resolution is chosen based on the selection."}),
            "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
            "noise_aug_strength": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}),
        }
        return {"required": required}

    RETURN_TYPES = ("COGCONTROL_LATENTS", "INT", "INT",)
    RETURN_NAMES = ("control_latents", "width", "height")
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    def encode(self, pipeline, control_video, base_resolution, enable_tiling, noise_aug_strength=0.0563):
        """Encode ``control_video`` with the pipeline's VAE.

        Args:
            pipeline: dict read for the keys ``"pipe"`` and ``"cpu_offloading"``.
            control_video: 4-D tensor unpacked as ``(B, H, W, C)``.
            base_resolution: scales the ``ASPECT_RATIO_512`` buckets before
                the closest bucket is selected.
            enable_tiling: when true, enables tiled VAE encoding.
            noise_aug_strength: when greater than zero, noise of this ratio
                is added to the video before encoding.

        Returns:
            ``(control_latents, width, height)`` where ``control_latents`` is
            a dict with keys ``latents``, ``num_frames``, ``height`` and
            ``width``.
        """
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()

        B, H, W, C = control_video.shape

        pipe = pipeline["pipe"]
        vae = pipe.vae
        vae.enable_slicing()

        if enable_tiling:
            from .mz_enable_vae_encode_tiling import enable_vae_encode_tiling
            enable_vae_encode_tiling(vae)

        if not pipeline["cpu_offloading"]:
            vae.to(device)

        # Scale every 512-based bucket to the requested base resolution.
        aspect_ratio_sample_size = {
            bucket: [side / 512 * base_resolution for side in sides]
            for bucket, sides in ASPECT_RATIO_512.items()
        }

        # Convert to uint8 frames so the first frame's size can be read via PIL.
        control_video = np.array(control_video.cpu().numpy() * 255, np.uint8)
        first_frame = Image.fromarray(control_video[0])
        original_width, original_height = first_frame.size

        closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
        # Round both sides down to a multiple of 16.
        height, width = [int(side / 16) * 16 for side in closest_size]
        log.info(f"Closest bucket size: {width}x{height}")

        if B != 1:
            # Largest frame count of the form k * temporal_compression_ratio + 1 that fits in B.
            video_length = int((B - 1) // vae.config.temporal_compression_ratio * vae.config.temporal_compression_ratio) + 1
        else:
            video_length = 1
        input_video, input_video_mask, clip_image = get_video_to_video_latent(control_video, video_length=video_length, sample_size=(height, width))

        frames = rearrange(input_video, "b c f h w -> (b f) c h w")
        control_video = pipe.image_processor.preprocess(frames, height=height, width=width)
        control_video = control_video.to(dtype=torch.float32)
        control_video = rearrange(control_video, "(b f) c h w -> b c f h w", f=video_length)

        masked_image = control_video.to(device=device, dtype=vae.dtype)
        if noise_aug_strength > 0:
            masked_image = add_noise_to_reference_video(masked_image, ratio=noise_aug_strength)

        # Encode one batch element at a time and take the mode of each posterior.
        bs = 1
        encoded_chunks = [
            vae.encode(masked_image[start : start + bs])[0].mode()
            for start in range(0, masked_image.shape[0], bs)
        ]
        masked_image_latents = torch.cat(encoded_chunks, dim = 0)
        masked_image_latents = masked_image_latents * vae.config.scaling_factor

        vae.to(offload_device)

        control_latents = {
            "latents": masked_image_latents,
            "num_frames" : B,
            "height" : height,
            "width" : width,
        }

        return (control_latents, width, height)
|
||||
|
||||
#region FasterCache
|
||||
class CogVideoXFasterCache:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -659,7 +774,8 @@ class CogVideoXFasterCache:
|
||||
"cache_device" : device if cache_device == "main_device" else offload_device
|
||||
}
|
||||
return (fastercache,)
|
||||
|
||||
|
||||
#region Sampler
|
||||
class CogVideoSampler:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -782,7 +898,43 @@ class CogVideoSampler:
|
||||
mm.soft_empty_cache()
|
||||
|
||||
return (pipeline, {"samples": latents})
|
||||
|
||||
class CogVideoControlNet:
    """ComfyUI node that bundles a ControlNet model with its control frames and schedule.

    The node does no heavy computation: it re-lays-out the incoming image
    batch and packs it, together with the strength and start/end percentages,
    into a plain dict typed as ``COGVIDECONTROLNET``.
    """

    @classmethod
    def INPUT_TYPES(s):
        """Return the input socket/widget specification for this node."""
        return {"required": {
            "controlnet": ("COGVIDECONTROLNETMODEL",),
            "images": ("IMAGE", ),
            "control_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
            "control_start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            "control_end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            },
        }

    RETURN_TYPES = ("COGVIDECONTROLNET",)
    RETURN_NAMES = ("cogvideo_controlnet",)
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    def encode(self, controlnet, images, control_strength, control_start_percent, control_end_percent):
        """Pack the ControlNet model and its conditioning into a single dict.

        Args:
            controlnet: The loaded ControlNet model object (passed through untouched).
            images: 4-D tensor laid out as (B, H, W, C).
            control_strength: Stored under ``"control_weights"``.
            control_start_percent: Stored under ``"control_start"``.
            control_end_percent: Stored under ``"control_end"``.

        Returns:
            A 1-tuple holding the dict with keys ``control_model``,
            ``control_frames``, ``control_weights``, ``control_start`` and
            ``control_end``.

        Raises:
            ValueError: If ``images`` is not a 4-D tensor.
        """
        # The previous implementation unpacked images.shape into four unused
        # names, which only served as an implicit rank check. Keep that check
        # (and its exception type) but make it explicit.
        if images.dim() != 4:
            raise ValueError(
                f"images must be a 4-D (B, H, W, C) tensor, got shape {tuple(images.shape)}"
            )

        # (B, H, W, C) -> (B, C, H, W) -> (1, B, C, H, W), then rescale x -> 2x - 1.
        # NOTE(review): presumably inputs are in [0, 1] so the result lands in
        # [-1, 1] - confirm against the caller.
        control_frames = images.permute(0, 3, 1, 2).unsqueeze(0) * 2 - 1

        controlnet = {
            "control_model": controlnet,
            "control_frames": control_frames,
            "control_weights": control_strength,
            "control_start": control_start_percent,
            "control_end": control_end_percent,
        }

        return (controlnet,)
|
||||
|
||||
#region VideoDecode
|
||||
class CogVideoDecode:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -878,7 +1030,8 @@ class CogVideoXFunResizeToClosestBucket:
|
||||
resized_images = resized_images.movedim(1,-1)
|
||||
|
||||
return (resized_images, width, height)
|
||||
|
||||
|
||||
#region FunSamplers
|
||||
class CogVideoXFunSampler:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -888,7 +1041,8 @@ class CogVideoXFunSampler:
|
||||
"positive": ("CONDITIONING", ),
|
||||
"negative": ("CONDITIONING", ),
|
||||
"video_length": ("INT", {"default": 49, "min": 5, "max": 2048, "step": 4}),
|
||||
"base_resolution": ("INT", {"min": 64, "max": 1280, "step": 64, "default": 512, "tooltip": "Base resolution, closest training data bucket resolution is chosen based on the selection."}),
|
||||
"width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
|
||||
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
|
||||
"seed": ("INT", {"default": 43, "min": 0, "max": 0xffffffffffffffff}),
|
||||
"steps": ("INT", {"default": 50, "min": 1, "max": 200, "step": 1}),
|
||||
"cfg": ("FLOAT", {"default": 6.0, "min": 1.0, "max": 20.0, "step": 0.01}),
|
||||
@ -897,7 +1051,6 @@ class CogVideoXFunSampler:
|
||||
"optional":{
|
||||
"start_img": ("IMAGE",),
|
||||
"end_img": ("IMAGE",),
|
||||
"opt_empty_latent": ("LATENT",),
|
||||
"noise_aug_strength": ("FLOAT", {"default": 0.0563, "min": 0.0, "max": 1.0, "step": 0.001}),
|
||||
"context_options": ("COGCONTEXT", ),
|
||||
"tora_trajectory": ("TORAFEATURES", ),
|
||||
@ -912,8 +1065,8 @@ class CogVideoXFunSampler:
|
||||
FUNCTION = "process"
|
||||
CATEGORY = "CogVideoWrapper"
|
||||
|
||||
def process(self, pipeline, positive, negative, video_length, base_resolution, seed, steps, cfg, scheduler,
|
||||
start_img=None, end_img=None, opt_empty_latent=None, noise_aug_strength=0.0563, context_options=None, fastercache=None,
|
||||
def process(self, pipeline, positive, negative, video_length, width, height, seed, steps, cfg, scheduler,
|
||||
start_img=None, end_img=None, noise_aug_strength=0.0563, context_options=None, fastercache=None,
|
||||
tora_trajectory=None, vid2vid_images=None, vid2vid_denoise=1.0):
|
||||
device = mm.get_torch_device()
|
||||
offload_device = mm.unet_offload_device()
|
||||
@ -929,23 +1082,13 @@ class CogVideoXFunSampler:
|
||||
|
||||
mm.soft_empty_cache()
|
||||
|
||||
aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
|
||||
#vid2vid
|
||||
if vid2vid_images is not None:
|
||||
validation_video = np.array(vid2vid_images.cpu().numpy() * 255, np.uint8)
|
||||
original_width, original_height = Image.fromarray(validation_video[0]).size
|
||||
#img2vid
|
||||
elif start_img is not None:
|
||||
start_img = [to_pil(_start_img) for _start_img in start_img] if start_img is not None else None
|
||||
end_img = [to_pil(_end_img) for _end_img in end_img] if end_img is not None else None
|
||||
# Count most suitable height and width
|
||||
original_width, original_height = start_img[0].size if type(start_img) is list else Image.open(start_img).size
|
||||
else:
|
||||
original_width = opt_empty_latent["samples"][0].shape[-1] * 8
|
||||
original_height = opt_empty_latent["samples"][0].shape[-2] * 8
|
||||
closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
|
||||
height, width = [int(x / 16) * 16 for x in closest_size]
|
||||
log.info(f"Closest bucket size: {width}x{height}")
|
||||
end_img = [to_pil(_end_img) for _end_img in end_img] if end_img is not None else None
|
||||
|
||||
# Load Sampler
|
||||
if context_options is not None and context_options["context_schedule"] == "temporal_tiling":
|
||||
@ -1045,156 +1188,6 @@ class CogVideoXFunVid2VidSampler:
|
||||
DEPRECATED = True
|
||||
def process(self):
    """Stub entry point of the deprecated node; yields an empty output tuple."""
    return tuple()
|
||||
|
||||
def add_noise_to_reference_video(image, ratio=None):
    """Add per-sample Gaussian noise to a reference image/video tensor.

    Elements that are exactly ``-1`` receive no noise and are returned unchanged.

    Args:
        image: Tensor whose first axis is the batch axis. Any rank >= 1 is
            accepted; the per-sample noise level is broadcast over all
            remaining axes.
        ratio: Noise standard deviation applied to every sample. When
            ``None``, a separate level is drawn per sample as
            ``exp(N(-3.0, 0.5))``.

    Returns:
        A new tensor with the same shape as ``image``.
    """
    batch = image.shape[0]

    if ratio is None:
        # Log-normal noise level, sampled independently for each batch entry.
        sigma = torch.normal(mean=-3.0, std=0.5, size=(batch,)).to(image.device)
        sigma = torch.exp(sigma).to(image.dtype)
    else:
        sigma = torch.ones((batch,)).to(image.device, image.dtype) * ratio

    # Shape sigma as (B, 1, ..., 1) to match the input's rank. The previous
    # fixed `[:, None, None, None, None]` indexing was only correct for 5-D
    # input and silently broadcast lower-rank input to a wrong shape.
    sigma = sigma.reshape(batch, *([1] * (image.dim() - 1)))

    image_noise = torch.randn_like(image) * sigma
    # Leave elements that are exactly -1 untouched.
    image_noise = torch.where(image == -1, torch.zeros_like(image), image_noise)
    return image + image_noise
|
||||
|
||||
class CogVideoControlImageEncode:
    """Node that fits a control video to the closest size bucket and VAE-encodes it."""

    RETURN_TYPES = ("COGCONTROL_LATENTS", "INT", "INT",)
    RETURN_NAMES = ("control_latents", "width", "height")
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    @classmethod
    def INPUT_TYPES(s):
        """Return the input socket/widget specification for this node."""
        required = {
            "pipeline": ("COGVIDEOPIPE",),
            "control_video": ("IMAGE", ),
            "base_resolution": ("INT", {"min": 64, "max": 1280, "step": 64, "default": 512, "tooltip": "Base resolution, closest training data bucket resolution is chosen based on the selection."}),
            "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
            "noise_aug_strength": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}),
        }
        return {"required": required}

    def encode(self, pipeline, control_video, base_resolution, enable_tiling, noise_aug_strength=0.0563):
        """Encode ``control_video`` into scaled VAE latents at the closest bucket size.

        Args:
            pipeline: Dict providing ``"pipe"`` (with ``vae`` and
                ``image_processor``) and the ``"cpu_offloading"`` flag.
            control_video: 4-D tensor unpacked as (B, H, W, C).
            base_resolution: Scales the ``ASPECT_RATIO_512`` bucket table.
            enable_tiling: When true, VAE encode tiling is switched on.
            noise_aug_strength: If > 0, noise of this ratio is added before encoding.

        Returns:
            ``(control_latents, width, height)`` where ``control_latents`` is a
            dict with keys ``latents``, ``num_frames``, ``height`` and ``width``.
        """
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()

        # Only the frame count is used later; the unpack also enforces rank 4.
        B, _, _, _ = control_video.shape

        pipe = pipeline["pipe"]
        vae = pipe.vae
        vae.enable_slicing()

        if enable_tiling:
            from .mz_enable_vae_encode_tiling import enable_vae_encode_tiling
            enable_vae_encode_tiling(vae)

        if not pipeline["cpu_offloading"]:
            vae.to(device)

        # Rescale the 512-based bucket table to the requested base resolution.
        aspect_ratio_sample_size = {
            key: [side / 512 * base_resolution for side in sizes]
            for key, sizes in ASPECT_RATIO_512.items()
        }

        control_video = np.array(control_video.cpu().numpy() * 255, np.uint8)
        original_width, original_height = Image.fromarray(control_video[0]).size

        closest_size, _ = get_closest_ratio(
            original_height, original_width, ratios=aspect_ratio_sample_size
        )
        # Round each side down to a multiple of 16.
        height, width = [int(x / 16) * 16 for x in closest_size]
        log.info(f"Closest bucket size: {width}x{height}")

        # Trim the frame count to k * temporal_compression_ratio + 1.
        if B != 1:
            ratio_t = vae.config.temporal_compression_ratio
            video_length = int((B - 1) // ratio_t * ratio_t) + 1
        else:
            video_length = 1

        # The helper returns three values; only the video tensor is used here.
        input_video, _, _ = get_video_to_video_latent(
            control_video, video_length=video_length, sample_size=(height, width)
        )

        frames = rearrange(input_video, "b c f h w -> (b f) c h w")
        frames = pipe.image_processor.preprocess(frames, height=height, width=width)
        frames = frames.to(dtype=torch.float32)
        control_video = rearrange(frames, "(b f) c h w -> b c f h w", f=video_length)

        masked_image = control_video.to(device=device, dtype=vae.dtype)
        if noise_aug_strength > 0:
            masked_image = add_noise_to_reference_video(masked_image, ratio=noise_aug_strength)

        # Encode one batch entry at a time and take the mode of each posterior.
        encoded = [
            vae.encode(masked_image[idx : idx + 1])[0].mode()
            for idx in range(masked_image.shape[0])
        ]
        masked_image_latents = torch.cat(encoded, dim=0) * vae.config.scaling_factor

        vae.to(offload_device)

        control_latents = {
            "latents": masked_image_latents,
            "num_frames" : B,
            "height" : height,
            "width" : width,
        }

        return (control_latents, width, height)
|
||||
|
||||
class CogVideoControlNet:
    """ComfyUI node that bundles a ControlNet model with its control frames and schedule.

    The node does no heavy computation: it re-lays-out the incoming image
    batch and packs it, together with the strength and start/end percentages,
    into a plain dict typed as ``COGVIDECONTROLNET``.
    """

    @classmethod
    def INPUT_TYPES(s):
        """Return the input socket/widget specification for this node."""
        return {"required": {
            "controlnet": ("COGVIDECONTROLNETMODEL",),
            "images": ("IMAGE", ),
            "control_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
            "control_start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            "control_end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
            },
        }

    RETURN_TYPES = ("COGVIDECONTROLNET",)
    RETURN_NAMES = ("cogvideo_controlnet",)
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    def encode(self, controlnet, images, control_strength, control_start_percent, control_end_percent):
        """Pack the ControlNet model and its conditioning into a single dict.

        Args:
            controlnet: The loaded ControlNet model object (passed through untouched).
            images: 4-D tensor laid out as (B, H, W, C).
            control_strength: Stored under ``"control_weights"``.
            control_start_percent: Stored under ``"control_start"``.
            control_end_percent: Stored under ``"control_end"``.

        Returns:
            A 1-tuple holding the dict with keys ``control_model``,
            ``control_frames``, ``control_weights``, ``control_start`` and
            ``control_end``.

        Raises:
            ValueError: If ``images`` is not a 4-D tensor.
        """
        # The previous implementation unpacked images.shape into four unused
        # names, which only served as an implicit rank check. Keep that check
        # (and its exception type) but make it explicit.
        if images.dim() != 4:
            raise ValueError(
                f"images must be a 4-D (B, H, W, C) tensor, got shape {tuple(images.shape)}"
            )

        # (B, H, W, C) -> (B, C, H, W) -> (1, B, C, H, W), then rescale x -> 2x - 1.
        # NOTE(review): presumably inputs are in [0, 1] so the result lands in
        # [-1, 1] - confirm against the caller.
        control_frames = images.permute(0, 3, 1, 2).unsqueeze(0) * 2 - 1

        controlnet = {
            "control_model": controlnet,
            "control_frames": control_frames,
            "control_weights": control_strength,
            "control_start": control_start_percent,
            "control_end": control_end_percent,
        }

        return (controlnet,)
|
||||
|
||||
|
||||
class CogVideoContextOptions:
    """Node that gathers the context-window settings into one ``COGCONTEXT`` dict."""

    RETURN_TYPES = ("COGCONTEXT", )
    RETURN_NAMES = ("context_options",)
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"

    @classmethod
    def INPUT_TYPES(s):
        """Return the input widget specification for this node."""
        schedules = ["uniform_standard", "uniform_looped", "static_standard", "temporal_tiling"]
        required = {
            "context_schedule": (schedules,),
            "context_frames": ("INT", {"default": 48, "min": 2, "max": 100, "step": 1, "tooltip": "Number of pixel frames in the context, NOTE: the latent space has 4 frames in 1"} ),
            "context_stride": ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context stride as pixel frames, NOTE: the latent space has 4 frames in 1"} ),
            "context_overlap": ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context overlap as pixel frames, NOTE: the latent space has 4 frames in 1"} ),
            "freenoise": ("BOOLEAN", {"default": True, "tooltip": "Shuffle the noise"}),
        }
        return {"required": required}

    def process(self, context_schedule, context_frames, context_stride, context_overlap, freenoise):
        """Return the five settings as a 1-tuple holding a dict keyed by setting name."""
        names = (
            "context_schedule",
            "context_frames",
            "context_stride",
            "context_overlap",
            "freenoise",
        )
        values = (context_schedule, context_frames, context_stride, context_overlap, freenoise)
        # Pair names with values in declaration order so key order is stable.
        return (dict(zip(names, values)),)
|
||||
|
||||
class CogVideoXFunControlSampler:
|
||||
@classmethod
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user