temporal tiling for longer outputs

This commit is contained in:
kijai 2024-08-07 17:46:04 +03:00
parent b602a015bb
commit bbfaee3adb
5 changed files with 1018 additions and 502 deletions

View File

@ -0,0 +1,475 @@
{
"last_node_id": 33,
"last_link_id": 60,
"nodes": [
{
"id": 30,
"type": "CogVideoTextEncode",
"pos": [
500,
308
],
"size": {
"0": 474.8450012207031,
"1": 164.7423553466797
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 54
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
55
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature\nacoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters\nthrough the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The\nbackground includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical\nperformance."
]
},
{
"id": 20,
"type": "CLIPLoader",
"pos": [
-59,
397
],
"size": {
"0": 451.30548095703125,
"1": 82
},
"flags": {},
"order": 0,
"mode": 0,
"outputs": [
{
"name": "CLIP",
"type": "CLIP",
"links": [
54,
56
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CLIPLoader"
},
"widgets_values": [
"t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
"sd3"
]
},
{
"id": 31,
"type": "CogVideoTextEncode",
"pos": [
503,
521
],
"size": {
"0": 463.01251220703125,
"1": 98.10446166992188
},
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 56
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
57
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
""
]
},
{
"id": 1,
"type": "DownloadAndLoadCogVideoModel",
"pos": [
649,
182
],
"size": {
"0": 315,
"1": 58
},
"flags": {},
"order": 1,
"mode": 0,
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
36
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoModel"
},
"widgets_values": [
"bf16"
]
},
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1140,
783
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 37
},
{
"name": "samples",
"type": "LATENT",
"link": 38
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
59
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{
"id": 33,
"type": "GetImageSizeAndCount",
"pos": [
1189,
134
],
"size": {
"0": 210,
"1": 86
},
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 59
}
],
"outputs": [
{
"name": "image",
"type": "IMAGE",
"links": [
60
],
"shape": 3,
"slot_index": 0
},
{
"name": "720 width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "480 height",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "122 count",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "GetImageSizeAndCount"
}
},
{
"id": 22,
"type": "CogVideoSampler",
"pos": [
1041,
342
],
"size": {
"0": 315,
"1": 382
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 36
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 55,
"slot_index": 1
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 57
},
{
"name": "samples",
"type": "LATENT",
"link": null
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
37
],
"shape": 3
},
{
"name": "samples",
"type": "LATENT",
"links": [
38
],
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoSampler"
},
"widgets_values": [
480,
720,
128,
8,
25,
6,
806286757407563,
"fixed",
"DDIM",
48,
12,
1
]
},
{
"id": 32,
"type": "VHS_VideoCombine",
"pos": [
1439,
122
],
"size": [
563.3333740234375,
686.2222493489583
],
"flags": {},
"order": 7,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 60,
"slot_index": 0
},
{
"name": "audio",
"type": "VHS_AUDIO",
"link": null
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
},
{
"name": "vae",
"type": "VAE",
"link": null
}
],
"outputs": [
{
"name": "Filenames",
"type": "VHS_FILENAMES",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_VideoCombine"
},
"widgets_values": {
"frame_rate": 8,
"loop_count": 0,
"filename_prefix": "AnimateDiff",
"format": "video/h264-mp4",
"pix_fmt": "yuv420p",
"crf": 19,
"save_metadata": true,
"pingpong": false,
"save_output": false,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "AnimateDiff_00002.mp4",
"subfolder": "",
"type": "temp",
"format": "video/h264-mp4",
"frame_rate": 8
}
}
}
}
],
"links": [
[
36,
1,
0,
22,
0,
"COGVIDEOPIPE"
],
[
37,
22,
0,
11,
0,
"COGVIDEOPIPE"
],
[
38,
22,
1,
11,
1,
"LATENT"
],
[
54,
20,
0,
30,
0,
"CLIP"
],
[
55,
30,
0,
22,
1,
"CONDITIONING"
],
[
56,
20,
0,
31,
0,
"CLIP"
],
[
57,
31,
0,
22,
2,
"CONDITIONING"
],
[
59,
11,
0,
33,
0,
"IMAGE"
],
[
60,
33,
0,
32,
0,
"IMAGE"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.9090909090909091,
"offset": [
49.8551278885073,
87.4070604693312
]
}
},
"version": 0.4
}

View File

@ -1,6 +1,6 @@
{
"last_node_id": 69,
"last_link_id": 176,
"last_node_id": 70,
"last_link_id": 181,
"nodes": [
{
"id": 20,
@ -48,7 +48,7 @@
"1": 86
},
"flags": {},
"order": 13,
"order": 12,
"mode": 0,
"inputs": [
{
@ -81,7 +81,7 @@
"shape": 3
},
{
"name": "25 count",
"name": "26 count",
"type": "INT",
"links": [
121
@ -166,47 +166,6 @@
"bf16"
]
},
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1201,
684
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 12,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 166
},
{
"name": "samples",
"type": "LATENT",
"link": 167
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
118
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{
"id": 41,
"type": "ImageResizeKJ",
@ -225,7 +184,7 @@
{
"name": "image",
"type": "IMAGE",
"link": 128
"link": 180
},
{
"name": "get_image_size",
@ -328,124 +287,6 @@
"Node name for S&R": "CogVideoImageEncode"
}
},
{
"id": 57,
"type": "GetImageSizeAndCount",
"pos": [
603,
-65
],
"size": [
202.21431350127853,
99.2360176040001
],
"flags": {},
"order": 8,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 126,
"slot_index": 0
}
],
"outputs": [
{
"name": "image",
"type": "IMAGE",
"links": [
129,
136
],
"shape": 3,
"slot_index": 0
},
{
"name": "720 width",
"type": "INT",
"links": [
165
],
"shape": 3,
"slot_index": 1
},
{
"name": "480 height",
"type": "INT",
"links": [
164
],
"shape": 3,
"slot_index": 2
},
{
"name": "28 count",
"type": "INT",
"links": [
171,
173
],
"shape": 3,
"slot_index": 3
}
],
"properties": {
"Node name for S&R": "GetImageSizeAndCount"
}
},
{
"id": 67,
"type": "SimpleMath+",
"pos": [
665,
98
],
"size": {
"0": 315,
"1": 78
},
"flags": {
"collapsed": true
},
"order": 10,
"mode": 0,
"inputs": [
{
"name": "a",
"type": "INT,FLOAT",
"link": 173
},
{
"name": "b",
"type": "INT,FLOAT",
"link": null
}
],
"outputs": [
{
"name": "INT",
"type": "INT",
"links": [
174
],
"shape": 3,
"slot_index": 0
},
{
"name": "FLOAT",
"type": "FLOAT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "SimpleMath+"
},
"widgets_values": [
"a - 4"
]
},
{
"id": 59,
"type": "GetImageRangeFromBatch",
@ -460,7 +301,7 @@
"flags": {
"collapsed": true
},
"order": 15,
"order": 14,
"mode": 0,
"inputs": [
{
@ -520,7 +361,7 @@
"1": 102
},
"flags": {},
"order": 16,
"order": 15,
"mode": 0,
"inputs": [
{
@ -567,7 +408,7 @@
"flags": {
"collapsed": true
},
"order": 14,
"order": 13,
"mode": 0,
"inputs": [
{
@ -605,184 +446,6 @@
"a - b"
]
},
{
"id": 45,
"type": "VHS_LoadVideo",
"pos": [
-93,
-153
],
"size": [
235.1999969482422,
371.5999984741211
],
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
},
{
"name": "vae",
"type": "VAE",
"link": null
},
{
"name": "frame_load_cap",
"type": "INT",
"link": 176,
"widget": {
"name": "frame_load_cap"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
128
],
"shape": 3,
"slot_index": 0
},
{
"name": "frame_count",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "audio",
"type": "VHS_AUDIO",
"links": null,
"shape": 3
},
{
"name": "video_info",
"type": "VHS_VIDEOINFO",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_LoadVideo"
},
"widgets_values": {
"video": "jeep.mp4",
"force_rate": 0,
"force_size": "Disabled",
"custom_width": 512,
"custom_height": 512,
"frame_load_cap": 20,
"skip_first_frames": 0,
"select_every_nth": 1,
"choose video to upload": "image",
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"frame_load_cap": 20,
"skip_first_frames": 0,
"force_rate": 0,
"filename": "jeep.mp4",
"type": "input",
"format": "video/mp4",
"select_every_nth": 1
}
}
}
},
{
"id": 68,
"type": "SimpleMath+",
"pos": [
-75,
-197
],
"size": {
"0": 315,
"1": 78
},
"flags": {
"collapsed": true
},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "a",
"type": "INT,FLOAT",
"link": 175,
"slot_index": 0
},
{
"name": "b",
"type": "INT,FLOAT",
"link": null
}
],
"outputs": [
{
"name": "INT",
"type": "INT",
"links": [
176
],
"shape": 3,
"slot_index": 0
},
{
"name": "FLOAT",
"type": "FLOAT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "SimpleMath+"
},
"widgets_values": [
"a + 4"
]
},
{
"id": 69,
"type": "INTConstant",
"pos": [
-90,
-305
],
"size": [
200,
58
],
"flags": {},
"order": 2,
"mode": 0,
"outputs": [
{
"name": "value",
"type": "INT",
"links": [
175
],
"shape": 3
}
],
"title": "Frames to load",
"properties": {
"Node name for S&R": "INTConstant"
},
"widgets_values": [
24
],
"color": "#1b4669",
"bgcolor": "#29699c"
},
{
"id": 47,
"type": "VHS_VideoCombine",
@ -795,7 +458,7 @@
711.3333333333333
],
"flags": {},
"order": 17,
"order": 16,
"mode": 0,
"inputs": [
{
@ -854,6 +517,47 @@
}
}
},
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1224,
737
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 11,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 166
},
{
"name": "samples",
"type": "LATENT",
"link": 167
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
118
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{
"id": 30,
"type": "CogVideoTextEncode",
@ -890,9 +594,259 @@
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"A high-definition nature video showcasing a vibrant red panda as it gracefully runs down a crystal-clear stream, surrounded by the serene ambiance of a dense, verdant forest. The sunlight filters through the canopy of tall trees, casting dappled light on the forest floor, while the gentle sound of flowing water and rustling leaves creates a peaceful atmosphere. The red pandas fur glistens in the sunlight, highlighting its striking red and white markings as it navigates the stream with agility and playfulness, occasionally pausing to drink from the water or look around curiously."
"A high-definition nature video showcasing a brown bear as it gracefully runs down a crystal-clear stream, surrounded by the serene ambiance of a dense, verdant forest. The sunlight filters through the canopy of tall trees, casting dappled light on the forest floor, while the gentle sound of flowing water and rustling leaves creates a peaceful atmosphere. The brown bear's fur glistens in the sunlight, highlighting its striking red and white markings as it navigates the stream with agility and playfulness."
]
},
{
"id": 57,
"type": "GetImageSizeAndCount",
"pos": [
603,
-65
],
"size": {
"0": 202.2143096923828,
"1": 99.23601531982422
},
"flags": {},
"order": 8,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 126,
"slot_index": 0
}
],
"outputs": [
{
"name": "image",
"type": "IMAGE",
"links": [
129,
136
],
"shape": 3,
"slot_index": 0
},
{
"name": "720 width",
"type": "INT",
"links": [
165
],
"shape": 3,
"slot_index": 1
},
{
"name": "480 height",
"type": "INT",
"links": [
164
],
"shape": 3,
"slot_index": 2
},
{
"name": "32 count",
"type": "INT",
"links": [
171,
178,
181
],
"shape": 3,
"slot_index": 3
}
],
"properties": {
"Node name for S&R": "GetImageSizeAndCount"
}
},
{
"id": 45,
"type": "VHS_LoadVideo",
"pos": [
-93,
-153
],
"size": [
235.1999969482422,
359.5999984741211
],
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
},
{
"name": "vae",
"type": "VAE",
"link": null
},
{
"name": "frame_load_cap",
"type": "INT",
"link": 177,
"widget": {
"name": "frame_load_cap"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
179
],
"shape": 3,
"slot_index": 0
},
{
"name": "frame_count",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "audio",
"type": "VHS_AUDIO",
"links": null,
"shape": 3
},
{
"name": "video_info",
"type": "VHS_VIDEOINFO",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_LoadVideo"
},
"widgets_values": {
"video": "jeep.mp4",
"force_rate": 0,
"force_size": "Disabled",
"custom_width": 512,
"custom_height": 512,
"frame_load_cap": 20,
"skip_first_frames": 0,
"select_every_nth": 1,
"choose video to upload": "image",
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"frame_load_cap": 20,
"skip_first_frames": 0,
"force_rate": 0,
"filename": "jeep.mp4",
"type": "input",
"format": "video/mp4",
"select_every_nth": 1
}
}
}
},
{
"id": 70,
"type": "GetImageSizeAndCount",
"pos": [
214,
-234
],
"size": {
"0": 202.2143096923828,
"1": 99.23601531982422
},
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 179,
"slot_index": 0
}
],
"outputs": [
{
"name": "image",
"type": "IMAGE",
"links": [
180
],
"shape": 3,
"slot_index": 0
},
{
"name": "512 width",
"type": "INT",
"links": [],
"shape": 3,
"slot_index": 1
},
{
"name": "256 height",
"type": "INT",
"links": [],
"shape": 3,
"slot_index": 2
},
{
"name": "32 count",
"type": "INT",
"links": [],
"shape": 3,
"slot_index": 3
}
],
"properties": {
"Node name for S&R": "GetImageSizeAndCount"
}
},
{
"id": 69,
"type": "INTConstant",
"pos": [
-90,
-305
],
"size": {
"0": 210,
"1": 58
},
"flags": {},
"order": 2,
"mode": 0,
"outputs": [
{
"name": "value",
"type": "INT",
"links": [
177
],
"shape": 3
}
],
"title": "Frames to load",
"properties": {
"Node name for S&R": "INTConstant"
},
"widgets_values": [
32
],
"color": "#1b4669",
"bgcolor": "#29699c"
},
{
"id": 64,
"type": "CogVideoSampler",
@ -902,10 +856,10 @@
],
"size": [
315,
342
370
],
"flags": {},
"order": 11,
"order": 10,
"mode": 0,
"inputs": [
{
@ -947,10 +901,19 @@
{
"name": "num_frames",
"type": "INT",
"link": 174,
"link": 178,
"widget": {
"name": "num_frames"
}
},
{
"name": "t_tile_length",
"type": "INT",
"link": 181,
"widget": {
"name": "t_tile_length"
},
"slot_index": 7
}
],
"outputs": [
@ -979,12 +942,14 @@
720,
16,
8,
50,
25,
9,
12,
13,
"fixed",
"DPM",
0.81
"DDIM",
32,
2,
0.8
]
}
],
@ -1037,14 +1002,6 @@
0,
"IMAGE"
],
[
128,
45,
0,
41,
0,
"IMAGE"
],
[
129,
57,
@ -1166,35 +1123,43 @@
"LATENT"
],
[
173,
57,
3,
67,
177,
69,
0,
"INT,FLOAT"
45,
2,
"INT"
],
[
174,
67,
0,
178,
57,
3,
64,
6,
"INT"
],
[
175,
69,
179,
45,
0,
68,
70,
0,
"INT,FLOAT"
"IMAGE"
],
[
176,
68,
180,
70,
0,
45,
2,
41,
0,
"IMAGE"
],
[
181,
57,
3,
64,
7,
"INT"
]
],
@ -1204,8 +1169,8 @@
"ds": {
"scale": 0.7513148009015777,
"offset": [
281.39770788130244,
559.6153930987157
177.74090581831425,
461.56507330501444
]
}
},

View File

@ -2,77 +2,6 @@
"last_node_id": 31,
"last_link_id": 57,
"nodes": [
{
"id": 22,
"type": "CogVideoSampler",
"pos": [
1041,
342
],
"size": {
"0": 315,
"1": 334
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 36
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 55,
"slot_index": 1
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 57
},
{
"name": "samples",
"type": "LATENT",
"link": null
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
37
],
"shape": 3
},
{
"name": "samples",
"type": "LATENT",
"links": [
38
],
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoSampler"
},
"widgets_values": [
480,
720,
16,
8,
25,
6,
806286757407561,
"fixed",
"DDIM",
1
]
},
{
"id": 28,
"type": "VHS_VideoCombine",
@ -82,7 +11,7 @@
],
"size": [
667.752197265625,
755.8347981770833
310
],
"flags": {},
"order": 6,
@ -292,8 +221,8 @@
"id": 11,
"type": "CogVideoDecode",
"pos": [
1138,
725
1140,
783
],
"size": {
"0": 210,
@ -328,6 +257,79 @@
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{
"id": 22,
"type": "CogVideoSampler",
"pos": [
1041,
342
],
"size": {
"0": 315,
"1": 382
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 36
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 55,
"slot_index": 1
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 57
},
{
"name": "samples",
"type": "LATENT",
"link": null
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
37
],
"shape": 3
},
{
"name": "samples",
"type": "LATENT",
"links": [
38
],
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoSampler"
},
"widgets_values": [
480,
720,
16,
8,
25,
6,
806286757407561,
"fixed",
"DDIM",
16,
2,
1
]
}
],
"links": [
@ -400,10 +402,10 @@
"config": {},
"extra": {
"ds": {
"scale": 0.9090909090909092,
"scale": 0.8264462809917356,
"offset": [
12.99028921497383,
38.21608107136124
253.92700064075518,
186.82608107136124
]
}
},

View File

@ -153,17 +153,17 @@ class CogVideoImageEncode:
vae = pipeline["pipe"].vae
vae.to(device)
image = image * 2.0 - 1.0
image = image.to(vae.dtype).to(device)
image = image.unsqueeze(0).permute(0, 4, 1, 2, 3) # B, C, T, H, W
B, C, T, H, W = image.shape
input_image = image.clone() * 2.0 - 1.0
input_image = input_image.to(vae.dtype).to(device)
input_image = input_image.unsqueeze(0).permute(0, 4, 1, 2, 3) # B, C, T, H, W
B, C, T, H, W = input_image.shape
chunk_size = 16
latents_list = []
# Loop through the temporal dimension in chunks of 16
for i in range(0, T, chunk_size):
# Get the chunk of 16 frames (or remaining frames if less than 16 are left)
end_index = min(i + chunk_size, T)
image_chunk = image[:, :, i:end_index, :, :] # Shape: [B, C, chunk_size, H, W]
image_chunk = input_image[:, :, i:end_index, :, :] # Shape: [B, C, chunk_size, H, W]
# Encode the chunk of images
latents = vae.encode(image_chunk)
@ -179,6 +179,7 @@ class CogVideoImageEncode:
latents = vae.config.scaling_factor * latents
latents = latents.permute(0, 2, 1, 3, 4) # B, T_chunk, C, H, W
latents_list.append(latents)
vae.clear_fake_context_parallel_cache()
# Concatenate all the chunks along the temporal dimension
final_latents = torch.cat(latents_list, dim=1)
@ -198,12 +199,14 @@ class CogVideoSampler:
"negative": ("CONDITIONING", ),
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
"width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
"num_frames": ("INT", {"default": 48, "min": 8, "max": 100, "step": 8}),
"num_frames": ("INT", {"default": 48, "min": 8, "max": 1024, "step": 8}),
"fps": ("INT", {"default": 8, "min": 1, "max": 100, "step": 1}),
"steps": ("INT", {"default": 25, "min": 1}),
"cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
"seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
"scheduler": (["DDIM", "DPM"],),
"t_tile_length": ("INT", {"default": 16, "min": 16, "max": 128, "step": 4}),
"t_tile_overlap": ("INT", {"default": 8, "min": 8, "max": 128, "step": 2}),
},
"optional": {
"samples": ("LATENT", ),
@ -216,14 +219,20 @@ class CogVideoSampler:
FUNCTION = "process"
CATEGORY = "CogVideoWrapper"
def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, samples=None, denoise_strength=1.0):
def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, t_tile_length, t_tile_overlap, samples=None, denoise_strength=1.0):
mm.soft_empty_cache()
assert t_tile_length > t_tile_overlap, "t_tile_length must be greater than t_tile_overlap"
assert t_tile_length <= num_frames, "t_tile_length must be equal or less than num_frames"
t_tile_length = t_tile_length // 4
t_tile_overlap = t_tile_overlap // 4
device = mm.get_torch_device()
offload_device = mm.unet_offload_device()
pipe = pipeline["pipe"]
dtype = pipeline["dtype"]
base_path = pipeline["base_path"]
pipe.transformer.to(device)
generator = torch.Generator(device=device).manual_seed(seed)
@ -237,6 +246,8 @@ class CogVideoSampler:
height = height,
width = width,
num_frames = num_frames,
t_tile_length = t_tile_length,
t_tile_overlap = t_tile_overlap,
fps = fps,
guidance_scale=cfg,
latents=samples["samples"] if samples is not None else None,

View File

@ -218,6 +218,16 @@ class CogVideoXPipeline(DiffusionPipeline):
self.scheduler.set_begin_index(t_start * self.scheduler.order)
return timesteps.to(device), num_inference_steps - t_start
def _gaussian_weights(self, t_tile_length, t_batch_size):
from numpy import pi, exp, sqrt
var = 0.01
midpoint = (t_tile_length - 1) / 2 # -1 because index goes from 0 to t_tile_length - 1
t_probs = [exp(-(t-midpoint)*(t-midpoint)/(t_tile_length*t_tile_length)/(2*var)) / sqrt(2*pi*var) for t in range(t_tile_length)]
weights = torch.tensor(t_probs)
weights = weights.unsqueeze(0).unsqueeze(2).unsqueeze(3).unsqueeze(4).repeat(1, t_batch_size,1, 1, 1)
return weights
@property
def guidance_scale(self):
@ -244,6 +254,8 @@ class CogVideoXPipeline(DiffusionPipeline):
height: int = 480,
width: int = 720,
num_frames: int = 48,
t_tile_length: int = 12,
t_tile_overlap: int = 4,
fps: int = 8,
num_inference_steps: int = 50,
timesteps: Optional[List[int]] = None,
@ -301,9 +313,9 @@ class CogVideoXPipeline(DiffusionPipeline):
argument.
"""
assert (
num_frames <= 48 and num_frames % fps == 0 and fps == 8
), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX."
#assert (
# num_frames <= 48 and num_frames % fps == 0 and fps == 8
#), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX."
height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
@ -337,7 +349,10 @@ class CogVideoXPipeline(DiffusionPipeline):
# 5. Prepare latents.
latent_channels = self.transformer.config.in_channels
num_frames += 1
if latents is None and num_frames == t_tile_length:
num_frames += 1
latents, timesteps = self.prepare_latents(
batch_size * num_videos_per_prompt,
latent_channels,
@ -356,6 +371,9 @@ class CogVideoXPipeline(DiffusionPipeline):
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
t_tile_weights = self._gaussian_weights(t_tile_length=t_tile_length, t_batch_size=1).to(latents.device).to(latents.dtype)
print("latents.shape", latents.shape)
print("latents.device", latents.device)
# 7. Denoising loop
num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
comfy_pbar = ProgressBar(num_inference_steps)
@ -365,45 +383,90 @@ class CogVideoXPipeline(DiffusionPipeline):
for i, t in enumerate(timesteps):
if self.interrupt:
continue
#temporal tiling code based on https://github.com/mayuelala/FollowYourEmoji/blob/main/models/video_pipeline.py
# =====================================================
grid_ts = 0
cur_t = 0
while cur_t < latents.shape[1]:
cur_t = max(grid_ts * t_tile_length - t_tile_overlap * grid_ts, 0) + t_tile_length
grid_ts += 1
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
all_t = latents.shape[1]
latents_all_list = []
# =====================================================
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
timestep = t.expand(latent_model_input.shape[0])
for t_i in range(grid_ts):
if t_i < grid_ts - 1:
ofs_t = max(t_i * t_tile_length - t_tile_overlap * t_i, 0)
if t_i == grid_ts - 1:
ofs_t = all_t - t_tile_length
# predict noise model_output
noise_pred = self.transformer(
hidden_states=latent_model_input,
encoder_hidden_states=prompt_embeds,
timestep=timestep,
return_dict=False,
)[0]
noise_pred = noise_pred.float()
input_start_t = ofs_t
input_end_t = ofs_t + t_tile_length
# perform guidance
# self._guidance_scale = 1 + guidance_scale * (
# (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
# )
# print(self._guidance_scale)
if self.do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
#latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
#latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# compute the previous noisy sample x_t -> x_t-1
if not isinstance(self.scheduler, CogVideoXDPMScheduler):
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
else:
latents, old_pred_original_sample = self.scheduler.step(
noise_pred,
old_pred_original_sample,
t,
timesteps[i - 1] if i > 0 else None,
latents,
**extra_step_kwargs,
latents_tile = latents[:, input_start_t:input_end_t,:, :, :]
latent_model_input_tile = torch.cat([latents_tile] * 2) if do_classifier_free_guidance else latents_tile
latent_model_input_tile = self.scheduler.scale_model_input(latent_model_input_tile, t)
#t_input = t[None].to(device)
t_input = t.expand(latent_model_input_tile.shape[0]) # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
# predict noise model_output
noise_pred = self.transformer(
hidden_states=latent_model_input_tile,
encoder_hidden_states=prompt_embeds,
timestep=t_input,
return_dict=False,
)
latents = latents.to(prompt_embeds.dtype)
)[0]
noise_pred = noise_pred.float()
if self.do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
if not isinstance(self.scheduler, CogVideoXDPMScheduler):
latents_tile = self.scheduler.step(noise_pred, t, latents_tile, **extra_step_kwargs, return_dict=False)[0]
else:
raise NotImplementedError("DPM is not supported with temporal tiling")
# else:
# latents_tile, old_pred_original_sample = self.scheduler.step(
# noise_pred,
# old_pred_original_sample,
# t,
# t_input[t_i - 1] if t_i > 0 else None,
# latents_tile,
# **extra_step_kwargs,
# return_dict=False,
# )
latents_all_list.append(latents_tile)
# ==========================================
latents_all = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype)
contributors = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype)
# Add each tile contribution to overall latents
for t_i in range(grid_ts):
if t_i < grid_ts - 1:
ofs_t = max(t_i * t_tile_length - t_tile_overlap * t_i, 0)
if t_i == grid_ts - 1:
ofs_t = all_t - t_tile_length
input_start_t = ofs_t
input_end_t = ofs_t + t_tile_length
latents_all[:, input_start_t:input_end_t,:, :, :] += latents_all_list[t_i] * t_tile_weights
contributors[:, input_start_t:input_end_t,:, :, :] += t_tile_weights
latents_all /= contributors
latents = latents_all
# ==========================================
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()