temporal tiling for longer outputs

This commit is contained in:
kijai 2024-08-07 17:46:04 +03:00
parent b602a015bb
commit bbfaee3adb
5 changed files with 1018 additions and 502 deletions

View File

@ -0,0 +1,475 @@
{
"last_node_id": 33,
"last_link_id": 60,
"nodes": [
{
"id": 30,
"type": "CogVideoTextEncode",
"pos": [
500,
308
],
"size": {
"0": 474.8450012207031,
"1": 164.7423553466797
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 54
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
55
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature\nacoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters\nthrough the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The\nbackground includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical\nperformance."
]
},
{
"id": 20,
"type": "CLIPLoader",
"pos": [
-59,
397
],
"size": {
"0": 451.30548095703125,
"1": 82
},
"flags": {},
"order": 0,
"mode": 0,
"outputs": [
{
"name": "CLIP",
"type": "CLIP",
"links": [
54,
56
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CLIPLoader"
},
"widgets_values": [
"t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
"sd3"
]
},
{
"id": 31,
"type": "CogVideoTextEncode",
"pos": [
503,
521
],
"size": {
"0": 463.01251220703125,
"1": 98.10446166992188
},
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 56
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
57
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
""
]
},
{
"id": 1,
"type": "DownloadAndLoadCogVideoModel",
"pos": [
649,
182
],
"size": {
"0": 315,
"1": 58
},
"flags": {},
"order": 1,
"mode": 0,
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
36
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoModel"
},
"widgets_values": [
"bf16"
]
},
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1140,
783
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 37
},
{
"name": "samples",
"type": "LATENT",
"link": 38
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
59
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{
"id": 33,
"type": "GetImageSizeAndCount",
"pos": [
1189,
134
],
"size": {
"0": 210,
"1": 86
},
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 59
}
],
"outputs": [
{
"name": "image",
"type": "IMAGE",
"links": [
60
],
"shape": 3,
"slot_index": 0
},
{
"name": "720 width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "480 height",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "122 count",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "GetImageSizeAndCount"
}
},
{
"id": 22,
"type": "CogVideoSampler",
"pos": [
1041,
342
],
"size": {
"0": 315,
"1": 382
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 36
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 55,
"slot_index": 1
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 57
},
{
"name": "samples",
"type": "LATENT",
"link": null
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
37
],
"shape": 3
},
{
"name": "samples",
"type": "LATENT",
"links": [
38
],
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoSampler"
},
"widgets_values": [
480,
720,
128,
8,
25,
6,
806286757407563,
"fixed",
"DDIM",
48,
12,
1
]
},
{
"id": 32,
"type": "VHS_VideoCombine",
"pos": [
1439,
122
],
"size": [
563.3333740234375,
686.2222493489583
],
"flags": {},
"order": 7,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 60,
"slot_index": 0
},
{
"name": "audio",
"type": "VHS_AUDIO",
"link": null
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
},
{
"name": "vae",
"type": "VAE",
"link": null
}
],
"outputs": [
{
"name": "Filenames",
"type": "VHS_FILENAMES",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_VideoCombine"
},
"widgets_values": {
"frame_rate": 8,
"loop_count": 0,
"filename_prefix": "AnimateDiff",
"format": "video/h264-mp4",
"pix_fmt": "yuv420p",
"crf": 19,
"save_metadata": true,
"pingpong": false,
"save_output": false,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "AnimateDiff_00002.mp4",
"subfolder": "",
"type": "temp",
"format": "video/h264-mp4",
"frame_rate": 8
}
}
}
}
],
"links": [
[
36,
1,
0,
22,
0,
"COGVIDEOPIPE"
],
[
37,
22,
0,
11,
0,
"COGVIDEOPIPE"
],
[
38,
22,
1,
11,
1,
"LATENT"
],
[
54,
20,
0,
30,
0,
"CLIP"
],
[
55,
30,
0,
22,
1,
"CONDITIONING"
],
[
56,
20,
0,
31,
0,
"CLIP"
],
[
57,
31,
0,
22,
2,
"CONDITIONING"
],
[
59,
11,
0,
33,
0,
"IMAGE"
],
[
60,
33,
0,
32,
0,
"IMAGE"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.9090909090909091,
"offset": [
49.8551278885073,
87.4070604693312
]
}
},
"version": 0.4
}

View File

@ -1,6 +1,6 @@
{ {
"last_node_id": 69, "last_node_id": 70,
"last_link_id": 176, "last_link_id": 181,
"nodes": [ "nodes": [
{ {
"id": 20, "id": 20,
@ -48,7 +48,7 @@
"1": 86 "1": 86
}, },
"flags": {}, "flags": {},
"order": 13, "order": 12,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
@ -81,7 +81,7 @@
"shape": 3 "shape": 3
}, },
{ {
"name": "25 count", "name": "26 count",
"type": "INT", "type": "INT",
"links": [ "links": [
121 121
@ -166,47 +166,6 @@
"bf16" "bf16"
] ]
}, },
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1201,
684
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 12,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 166
},
{
"name": "samples",
"type": "LATENT",
"link": 167
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
118
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{ {
"id": 41, "id": 41,
"type": "ImageResizeKJ", "type": "ImageResizeKJ",
@ -225,7 +184,7 @@
{ {
"name": "image", "name": "image",
"type": "IMAGE", "type": "IMAGE",
"link": 128 "link": 180
}, },
{ {
"name": "get_image_size", "name": "get_image_size",
@ -328,124 +287,6 @@
"Node name for S&R": "CogVideoImageEncode" "Node name for S&R": "CogVideoImageEncode"
} }
}, },
{
"id": 57,
"type": "GetImageSizeAndCount",
"pos": [
603,
-65
],
"size": [
202.21431350127853,
99.2360176040001
],
"flags": {},
"order": 8,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 126,
"slot_index": 0
}
],
"outputs": [
{
"name": "image",
"type": "IMAGE",
"links": [
129,
136
],
"shape": 3,
"slot_index": 0
},
{
"name": "720 width",
"type": "INT",
"links": [
165
],
"shape": 3,
"slot_index": 1
},
{
"name": "480 height",
"type": "INT",
"links": [
164
],
"shape": 3,
"slot_index": 2
},
{
"name": "28 count",
"type": "INT",
"links": [
171,
173
],
"shape": 3,
"slot_index": 3
}
],
"properties": {
"Node name for S&R": "GetImageSizeAndCount"
}
},
{
"id": 67,
"type": "SimpleMath+",
"pos": [
665,
98
],
"size": {
"0": 315,
"1": 78
},
"flags": {
"collapsed": true
},
"order": 10,
"mode": 0,
"inputs": [
{
"name": "a",
"type": "INT,FLOAT",
"link": 173
},
{
"name": "b",
"type": "INT,FLOAT",
"link": null
}
],
"outputs": [
{
"name": "INT",
"type": "INT",
"links": [
174
],
"shape": 3,
"slot_index": 0
},
{
"name": "FLOAT",
"type": "FLOAT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "SimpleMath+"
},
"widgets_values": [
"a - 4"
]
},
{ {
"id": 59, "id": 59,
"type": "GetImageRangeFromBatch", "type": "GetImageRangeFromBatch",
@ -460,7 +301,7 @@
"flags": { "flags": {
"collapsed": true "collapsed": true
}, },
"order": 15, "order": 14,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
@ -520,7 +361,7 @@
"1": 102 "1": 102
}, },
"flags": {}, "flags": {},
"order": 16, "order": 15,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
@ -567,7 +408,7 @@
"flags": { "flags": {
"collapsed": true "collapsed": true
}, },
"order": 14, "order": 13,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
@ -605,184 +446,6 @@
"a - b" "a - b"
] ]
}, },
{
"id": 45,
"type": "VHS_LoadVideo",
"pos": [
-93,
-153
],
"size": [
235.1999969482422,
371.5999984741211
],
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
},
{
"name": "vae",
"type": "VAE",
"link": null
},
{
"name": "frame_load_cap",
"type": "INT",
"link": 176,
"widget": {
"name": "frame_load_cap"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
128
],
"shape": 3,
"slot_index": 0
},
{
"name": "frame_count",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "audio",
"type": "VHS_AUDIO",
"links": null,
"shape": 3
},
{
"name": "video_info",
"type": "VHS_VIDEOINFO",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_LoadVideo"
},
"widgets_values": {
"video": "jeep.mp4",
"force_rate": 0,
"force_size": "Disabled",
"custom_width": 512,
"custom_height": 512,
"frame_load_cap": 20,
"skip_first_frames": 0,
"select_every_nth": 1,
"choose video to upload": "image",
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"frame_load_cap": 20,
"skip_first_frames": 0,
"force_rate": 0,
"filename": "jeep.mp4",
"type": "input",
"format": "video/mp4",
"select_every_nth": 1
}
}
}
},
{
"id": 68,
"type": "SimpleMath+",
"pos": [
-75,
-197
],
"size": {
"0": 315,
"1": 78
},
"flags": {
"collapsed": true
},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "a",
"type": "INT,FLOAT",
"link": 175,
"slot_index": 0
},
{
"name": "b",
"type": "INT,FLOAT",
"link": null
}
],
"outputs": [
{
"name": "INT",
"type": "INT",
"links": [
176
],
"shape": 3,
"slot_index": 0
},
{
"name": "FLOAT",
"type": "FLOAT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "SimpleMath+"
},
"widgets_values": [
"a + 4"
]
},
{
"id": 69,
"type": "INTConstant",
"pos": [
-90,
-305
],
"size": [
200,
58
],
"flags": {},
"order": 2,
"mode": 0,
"outputs": [
{
"name": "value",
"type": "INT",
"links": [
175
],
"shape": 3
}
],
"title": "Frames to load",
"properties": {
"Node name for S&R": "INTConstant"
},
"widgets_values": [
24
],
"color": "#1b4669",
"bgcolor": "#29699c"
},
{ {
"id": 47, "id": 47,
"type": "VHS_VideoCombine", "type": "VHS_VideoCombine",
@ -795,7 +458,7 @@
711.3333333333333 711.3333333333333
], ],
"flags": {}, "flags": {},
"order": 17, "order": 16,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
@ -854,6 +517,47 @@
} }
} }
}, },
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1224,
737
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 11,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 166
},
{
"name": "samples",
"type": "LATENT",
"link": 167
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
118
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{ {
"id": 30, "id": 30,
"type": "CogVideoTextEncode", "type": "CogVideoTextEncode",
@ -890,9 +594,259 @@
"Node name for S&R": "CogVideoTextEncode" "Node name for S&R": "CogVideoTextEncode"
}, },
"widgets_values": [ "widgets_values": [
"A high-definition nature video showcasing a vibrant red panda as it gracefully runs down a crystal-clear stream, surrounded by the serene ambiance of a dense, verdant forest. The sunlight filters through the canopy of tall trees, casting dappled light on the forest floor, while the gentle sound of flowing water and rustling leaves creates a peaceful atmosphere. The red pandas fur glistens in the sunlight, highlighting its striking red and white markings as it navigates the stream with agility and playfulness, occasionally pausing to drink from the water or look around curiously." "A high-definition nature video showcasing a brown bear as it gracefully runs down a crystal-clear stream, surrounded by the serene ambiance of a dense, verdant forest. The sunlight filters through the canopy of tall trees, casting dappled light on the forest floor, while the gentle sound of flowing water and rustling leaves creates a peaceful atmosphere. The brown bear's fur glistens in the sunlight, highlighting its striking red and white markings as it navigates the stream with agility and playfulness."
] ]
}, },
{
"id": 57,
"type": "GetImageSizeAndCount",
"pos": [
603,
-65
],
"size": {
"0": 202.2143096923828,
"1": 99.23601531982422
},
"flags": {},
"order": 8,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 126,
"slot_index": 0
}
],
"outputs": [
{
"name": "image",
"type": "IMAGE",
"links": [
129,
136
],
"shape": 3,
"slot_index": 0
},
{
"name": "720 width",
"type": "INT",
"links": [
165
],
"shape": 3,
"slot_index": 1
},
{
"name": "480 height",
"type": "INT",
"links": [
164
],
"shape": 3,
"slot_index": 2
},
{
"name": "32 count",
"type": "INT",
"links": [
171,
178,
181
],
"shape": 3,
"slot_index": 3
}
],
"properties": {
"Node name for S&R": "GetImageSizeAndCount"
}
},
{
"id": 45,
"type": "VHS_LoadVideo",
"pos": [
-93,
-153
],
"size": [
235.1999969482422,
359.5999984741211
],
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
},
{
"name": "vae",
"type": "VAE",
"link": null
},
{
"name": "frame_load_cap",
"type": "INT",
"link": 177,
"widget": {
"name": "frame_load_cap"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
179
],
"shape": 3,
"slot_index": 0
},
{
"name": "frame_count",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "audio",
"type": "VHS_AUDIO",
"links": null,
"shape": 3
},
{
"name": "video_info",
"type": "VHS_VIDEOINFO",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_LoadVideo"
},
"widgets_values": {
"video": "jeep.mp4",
"force_rate": 0,
"force_size": "Disabled",
"custom_width": 512,
"custom_height": 512,
"frame_load_cap": 20,
"skip_first_frames": 0,
"select_every_nth": 1,
"choose video to upload": "image",
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"frame_load_cap": 20,
"skip_first_frames": 0,
"force_rate": 0,
"filename": "jeep.mp4",
"type": "input",
"format": "video/mp4",
"select_every_nth": 1
}
}
}
},
{
"id": 70,
"type": "GetImageSizeAndCount",
"pos": [
214,
-234
],
"size": {
"0": 202.2143096923828,
"1": 99.23601531982422
},
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 179,
"slot_index": 0
}
],
"outputs": [
{
"name": "image",
"type": "IMAGE",
"links": [
180
],
"shape": 3,
"slot_index": 0
},
{
"name": "512 width",
"type": "INT",
"links": [],
"shape": 3,
"slot_index": 1
},
{
"name": "256 height",
"type": "INT",
"links": [],
"shape": 3,
"slot_index": 2
},
{
"name": "32 count",
"type": "INT",
"links": [],
"shape": 3,
"slot_index": 3
}
],
"properties": {
"Node name for S&R": "GetImageSizeAndCount"
}
},
{
"id": 69,
"type": "INTConstant",
"pos": [
-90,
-305
],
"size": {
"0": 210,
"1": 58
},
"flags": {},
"order": 2,
"mode": 0,
"outputs": [
{
"name": "value",
"type": "INT",
"links": [
177
],
"shape": 3
}
],
"title": "Frames to load",
"properties": {
"Node name for S&R": "INTConstant"
},
"widgets_values": [
32
],
"color": "#1b4669",
"bgcolor": "#29699c"
},
{ {
"id": 64, "id": 64,
"type": "CogVideoSampler", "type": "CogVideoSampler",
@ -902,10 +856,10 @@
], ],
"size": [ "size": [
315, 315,
342 370
], ],
"flags": {}, "flags": {},
"order": 11, "order": 10,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
@ -947,10 +901,19 @@
{ {
"name": "num_frames", "name": "num_frames",
"type": "INT", "type": "INT",
"link": 174, "link": 178,
"widget": { "widget": {
"name": "num_frames" "name": "num_frames"
} }
},
{
"name": "t_tile_length",
"type": "INT",
"link": 181,
"widget": {
"name": "t_tile_length"
},
"slot_index": 7
} }
], ],
"outputs": [ "outputs": [
@ -979,12 +942,14 @@
720, 720,
16, 16,
8, 8,
50, 25,
9, 9,
12, 13,
"fixed", "fixed",
"DPM", "DDIM",
0.81 32,
2,
0.8
] ]
} }
], ],
@ -1037,14 +1002,6 @@
0, 0,
"IMAGE" "IMAGE"
], ],
[
128,
45,
0,
41,
0,
"IMAGE"
],
[ [
129, 129,
57, 57,
@ -1166,35 +1123,43 @@
"LATENT" "LATENT"
], ],
[ [
173, 177,
57, 69,
3,
67,
0, 0,
"INT,FLOAT" 45,
2,
"INT"
], ],
[ [
174, 178,
67, 57,
0, 3,
64, 64,
6, 6,
"INT" "INT"
], ],
[ [
175, 179,
69, 45,
0, 0,
68, 70,
0, 0,
"INT,FLOAT" "IMAGE"
], ],
[ [
176, 180,
68, 70,
0, 0,
45, 41,
2, 0,
"IMAGE"
],
[
181,
57,
3,
64,
7,
"INT" "INT"
] ]
], ],
@ -1204,8 +1169,8 @@
"ds": { "ds": {
"scale": 0.7513148009015777, "scale": 0.7513148009015777,
"offset": [ "offset": [
281.39770788130244, 177.74090581831425,
559.6153930987157 461.56507330501444
] ]
} }
}, },

View File

@ -2,77 +2,6 @@
"last_node_id": 31, "last_node_id": 31,
"last_link_id": 57, "last_link_id": 57,
"nodes": [ "nodes": [
{
"id": 22,
"type": "CogVideoSampler",
"pos": [
1041,
342
],
"size": {
"0": 315,
"1": 334
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 36
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 55,
"slot_index": 1
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 57
},
{
"name": "samples",
"type": "LATENT",
"link": null
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
37
],
"shape": 3
},
{
"name": "samples",
"type": "LATENT",
"links": [
38
],
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoSampler"
},
"widgets_values": [
480,
720,
16,
8,
25,
6,
806286757407561,
"fixed",
"DDIM",
1
]
},
{ {
"id": 28, "id": 28,
"type": "VHS_VideoCombine", "type": "VHS_VideoCombine",
@ -82,7 +11,7 @@
], ],
"size": [ "size": [
667.752197265625, 667.752197265625,
755.8347981770833 310
], ],
"flags": {}, "flags": {},
"order": 6, "order": 6,
@ -292,8 +221,8 @@
"id": 11, "id": 11,
"type": "CogVideoDecode", "type": "CogVideoDecode",
"pos": [ "pos": [
1138, 1140,
725 783
], ],
"size": { "size": {
"0": 210, "0": 210,
@ -328,6 +257,79 @@
"properties": { "properties": {
"Node name for S&R": "CogVideoDecode" "Node name for S&R": "CogVideoDecode"
} }
},
{
"id": 22,
"type": "CogVideoSampler",
"pos": [
1041,
342
],
"size": {
"0": 315,
"1": 382
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 36
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 55,
"slot_index": 1
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 57
},
{
"name": "samples",
"type": "LATENT",
"link": null
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
37
],
"shape": 3
},
{
"name": "samples",
"type": "LATENT",
"links": [
38
],
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoSampler"
},
"widgets_values": [
480,
720,
16,
8,
25,
6,
806286757407561,
"fixed",
"DDIM",
16,
2,
1
]
} }
], ],
"links": [ "links": [
@ -400,10 +402,10 @@
"config": {}, "config": {},
"extra": { "extra": {
"ds": { "ds": {
"scale": 0.9090909090909092, "scale": 0.8264462809917356,
"offset": [ "offset": [
12.99028921497383, 253.92700064075518,
38.21608107136124 186.82608107136124
] ]
} }
}, },

View File

@ -153,17 +153,17 @@ class CogVideoImageEncode:
vae = pipeline["pipe"].vae vae = pipeline["pipe"].vae
vae.to(device) vae.to(device)
image = image * 2.0 - 1.0 input_image = image.clone() * 2.0 - 1.0
image = image.to(vae.dtype).to(device) input_image = input_image.to(vae.dtype).to(device)
image = image.unsqueeze(0).permute(0, 4, 1, 2, 3) # B, C, T, H, W input_image = input_image.unsqueeze(0).permute(0, 4, 1, 2, 3) # B, C, T, H, W
B, C, T, H, W = image.shape B, C, T, H, W = input_image.shape
chunk_size = 16 chunk_size = 16
latents_list = [] latents_list = []
# Loop through the temporal dimension in chunks of 16 # Loop through the temporal dimension in chunks of 16
for i in range(0, T, chunk_size): for i in range(0, T, chunk_size):
# Get the chunk of 16 frames (or remaining frames if less than 16 are left) # Get the chunk of 16 frames (or remaining frames if less than 16 are left)
end_index = min(i + chunk_size, T) end_index = min(i + chunk_size, T)
image_chunk = image[:, :, i:end_index, :, :] # Shape: [B, C, chunk_size, H, W] image_chunk = input_image[:, :, i:end_index, :, :] # Shape: [B, C, chunk_size, H, W]
# Encode the chunk of images # Encode the chunk of images
latents = vae.encode(image_chunk) latents = vae.encode(image_chunk)
@ -179,6 +179,7 @@ class CogVideoImageEncode:
latents = vae.config.scaling_factor * latents latents = vae.config.scaling_factor * latents
latents = latents.permute(0, 2, 1, 3, 4) # B, T_chunk, C, H, W latents = latents.permute(0, 2, 1, 3, 4) # B, T_chunk, C, H, W
latents_list.append(latents) latents_list.append(latents)
vae.clear_fake_context_parallel_cache()
# Concatenate all the chunks along the temporal dimension # Concatenate all the chunks along the temporal dimension
final_latents = torch.cat(latents_list, dim=1) final_latents = torch.cat(latents_list, dim=1)
@ -198,12 +199,14 @@ class CogVideoSampler:
"negative": ("CONDITIONING", ), "negative": ("CONDITIONING", ),
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}), "height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
"width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}), "width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
"num_frames": ("INT", {"default": 48, "min": 8, "max": 100, "step": 8}), "num_frames": ("INT", {"default": 48, "min": 8, "max": 1024, "step": 8}),
"fps": ("INT", {"default": 8, "min": 1, "max": 100, "step": 1}), "fps": ("INT", {"default": 8, "min": 1, "max": 100, "step": 1}),
"steps": ("INT", {"default": 25, "min": 1}), "steps": ("INT", {"default": 25, "min": 1}),
"cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}), "cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
"seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}), "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
"scheduler": (["DDIM", "DPM"],), "scheduler": (["DDIM", "DPM"],),
"t_tile_length": ("INT", {"default": 16, "min": 16, "max": 128, "step": 4}),
"t_tile_overlap": ("INT", {"default": 8, "min": 8, "max": 128, "step": 2}),
}, },
"optional": { "optional": {
"samples": ("LATENT", ), "samples": ("LATENT", ),
@ -216,14 +219,20 @@ class CogVideoSampler:
FUNCTION = "process" FUNCTION = "process"
CATEGORY = "CogVideoWrapper" CATEGORY = "CogVideoWrapper"
def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, samples=None, denoise_strength=1.0): def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, t_tile_length, t_tile_overlap, samples=None, denoise_strength=1.0):
mm.soft_empty_cache() mm.soft_empty_cache()
assert t_tile_length > t_tile_overlap, "t_tile_length must be greater than t_tile_overlap"
assert t_tile_length <= num_frames, "t_tile_length must be equal or less than num_frames"
t_tile_length = t_tile_length // 4
t_tile_overlap = t_tile_overlap // 4
device = mm.get_torch_device() device = mm.get_torch_device()
offload_device = mm.unet_offload_device() offload_device = mm.unet_offload_device()
pipe = pipeline["pipe"] pipe = pipeline["pipe"]
dtype = pipeline["dtype"] dtype = pipeline["dtype"]
base_path = pipeline["base_path"] base_path = pipeline["base_path"]
pipe.transformer.to(device) pipe.transformer.to(device)
generator = torch.Generator(device=device).manual_seed(seed) generator = torch.Generator(device=device).manual_seed(seed)
@ -237,6 +246,8 @@ class CogVideoSampler:
height = height, height = height,
width = width, width = width,
num_frames = num_frames, num_frames = num_frames,
t_tile_length = t_tile_length,
t_tile_overlap = t_tile_overlap,
fps = fps, fps = fps,
guidance_scale=cfg, guidance_scale=cfg,
latents=samples["samples"] if samples is not None else None, latents=samples["samples"] if samples is not None else None,

View File

@ -218,6 +218,16 @@ class CogVideoXPipeline(DiffusionPipeline):
self.scheduler.set_begin_index(t_start * self.scheduler.order) self.scheduler.set_begin_index(t_start * self.scheduler.order)
return timesteps.to(device), num_inference_steps - t_start return timesteps.to(device), num_inference_steps - t_start
def _gaussian_weights(self, t_tile_length, t_batch_size):
from numpy import pi, exp, sqrt
var = 0.01
midpoint = (t_tile_length - 1) / 2 # -1 because index goes from 0 to latent_width - 1
t_probs = [exp(-(t-midpoint)*(t-midpoint)/(t_tile_length*t_tile_length)/(2*var)) / sqrt(2*pi*var) for t in range(t_tile_length)]
weights = torch.tensor(t_probs)
weights = weights.unsqueeze(0).unsqueeze(2).unsqueeze(3).unsqueeze(4).repeat(1, t_batch_size,1, 1, 1)
return weights
@property @property
def guidance_scale(self): def guidance_scale(self):
@ -244,6 +254,8 @@ class CogVideoXPipeline(DiffusionPipeline):
height: int = 480, height: int = 480,
width: int = 720, width: int = 720,
num_frames: int = 48, num_frames: int = 48,
t_tile_length: int = 12,
t_tile_overlap: int = 4,
fps: int = 8, fps: int = 8,
num_inference_steps: int = 50, num_inference_steps: int = 50,
timesteps: Optional[List[int]] = None, timesteps: Optional[List[int]] = None,
@ -301,9 +313,9 @@ class CogVideoXPipeline(DiffusionPipeline):
argument. argument.
""" """
assert ( #assert (
num_frames <= 48 and num_frames % fps == 0 and fps == 8 # num_frames <= 48 and num_frames % fps == 0 and fps == 8
), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX." #), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX."
height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
@ -337,7 +349,10 @@ class CogVideoXPipeline(DiffusionPipeline):
# 5. Prepare latents. # 5. Prepare latents.
latent_channels = self.transformer.config.in_channels latent_channels = self.transformer.config.in_channels
num_frames += 1
if latents is None and num_frames == t_tile_length:
num_frames += 1
latents, timesteps = self.prepare_latents( latents, timesteps = self.prepare_latents(
batch_size * num_videos_per_prompt, batch_size * num_videos_per_prompt,
latent_channels, latent_channels,
@ -356,6 +371,9 @@ class CogVideoXPipeline(DiffusionPipeline):
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
t_tile_weights = self._gaussian_weights(t_tile_length=t_tile_length, t_batch_size=1).to(latents.device).to(latents.dtype)
print("latents.shape", latents.shape)
print("latents.device", latents.device)
# 7. Denoising loop # 7. Denoising loop
num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
comfy_pbar = ProgressBar(num_inference_steps) comfy_pbar = ProgressBar(num_inference_steps)
@ -365,45 +383,90 @@ class CogVideoXPipeline(DiffusionPipeline):
for i, t in enumerate(timesteps): for i, t in enumerate(timesteps):
if self.interrupt: if self.interrupt:
continue continue
#temporal tiling code based on https://github.com/mayuelala/FollowYourEmoji/blob/main/models/video_pipeline.py
# =====================================================
grid_ts = 0
cur_t = 0
while cur_t < latents.shape[1]:
cur_t = max(grid_ts * t_tile_length - t_tile_overlap * grid_ts, 0) + t_tile_length
grid_ts += 1
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents all_t = latents.shape[1]
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) latents_all_list = []
# =====================================================
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML for t_i in range(grid_ts):
timestep = t.expand(latent_model_input.shape[0]) if t_i < grid_ts - 1:
ofs_t = max(t_i * t_tile_length - t_tile_overlap * t_i, 0)
if t_i == grid_ts - 1:
ofs_t = all_t - t_tile_length
# predict noise model_output input_start_t = ofs_t
noise_pred = self.transformer( input_end_t = ofs_t + t_tile_length
hidden_states=latent_model_input,
encoder_hidden_states=prompt_embeds,
timestep=timestep,
return_dict=False,
)[0]
noise_pred = noise_pred.float()
# perform guidance #latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
# self._guidance_scale = 1 + guidance_scale * ( #latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
# )
# print(self._guidance_scale)
if self.do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1 latents_tile = latents[:, input_start_t:input_end_t,:, :, :]
if not isinstance(self.scheduler, CogVideoXDPMScheduler): latent_model_input_tile = torch.cat([latents_tile] * 2) if do_classifier_free_guidance else latents_tile
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] latent_model_input_tile = self.scheduler.scale_model_input(latent_model_input_tile, t)
else:
latents, old_pred_original_sample = self.scheduler.step( #t_input = t[None].to(device)
noise_pred, t_input = t.expand(latent_model_input_tile.shape[0]) # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
old_pred_original_sample,
t, # predict noise model_output
timesteps[i - 1] if i > 0 else None, noise_pred = self.transformer(
latents, hidden_states=latent_model_input_tile,
**extra_step_kwargs, encoder_hidden_states=prompt_embeds,
timestep=t_input,
return_dict=False, return_dict=False,
) )[0]
latents = latents.to(prompt_embeds.dtype) noise_pred = noise_pred.float()
if self.do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
if not isinstance(self.scheduler, CogVideoXDPMScheduler):
latents_tile = self.scheduler.step(noise_pred, t, latents_tile, **extra_step_kwargs, return_dict=False)[0]
else:
raise NotImplementedError("DPM is not supported with temporal tiling")
# else:
# latents_tile, old_pred_original_sample = self.scheduler.step(
# noise_pred,
# old_pred_original_sample,
# t,
# t_input[t_i - 1] if t_i > 0 else None,
# latents_tile,
# **extra_step_kwargs,
# return_dict=False,
# )
latents_all_list.append(latents_tile)
# ==========================================
latents_all = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype)
contributors = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype)
# Add each tile contribution to overall latents
for t_i in range(grid_ts):
if t_i < grid_ts - 1:
ofs_t = max(t_i * t_tile_length - t_tile_overlap * t_i, 0)
if t_i == grid_ts - 1:
ofs_t = all_t - t_tile_length
input_start_t = ofs_t
input_end_t = ofs_t + t_tile_length
latents_all[:, input_start_t:input_end_t,:, :, :] += latents_all_list[t_i] * t_tile_weights
contributors[:, input_start_t:input_end_t,:, :, :] += t_tile_weights
latents_all /= contributors
latents = latents_all
# ==========================================
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update() progress_bar.update()