mirror of
https://git.datalinker.icu/kijai/ComfyUI-CogVideoXWrapper.git
synced 2026-01-24 03:14:22 +08:00
temporal tiling for longer outputs
This commit is contained in:
parent
b602a015bb
commit
bbfaee3adb
475
examples/cogvideo_long_01.json
Normal file
475
examples/cogvideo_long_01.json
Normal file
@ -0,0 +1,475 @@
|
||||
{
|
||||
"last_node_id": 33,
|
||||
"last_link_id": 60,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 30,
|
||||
"type": "CogVideoTextEncode",
|
||||
"pos": [
|
||||
500,
|
||||
308
|
||||
],
|
||||
"size": {
|
||||
"0": 474.8450012207031,
|
||||
"1": 164.7423553466797
|
||||
},
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "clip",
|
||||
"type": "CLIP",
|
||||
"link": 54
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "conditioning",
|
||||
"type": "CONDITIONING",
|
||||
"links": [
|
||||
55
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 0
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CogVideoTextEncode"
|
||||
},
|
||||
"widgets_values": [
|
||||
"A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature\nacoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters\nthrough the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The\nbackground includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical\nperformance."
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
"type": "CLIPLoader",
|
||||
"pos": [
|
||||
-59,
|
||||
397
|
||||
],
|
||||
"size": {
|
||||
"0": 451.30548095703125,
|
||||
"1": 82
|
||||
},
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "CLIP",
|
||||
"type": "CLIP",
|
||||
"links": [
|
||||
54,
|
||||
56
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 0
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CLIPLoader"
|
||||
},
|
||||
"widgets_values": [
|
||||
"t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
|
||||
"sd3"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 31,
|
||||
"type": "CogVideoTextEncode",
|
||||
"pos": [
|
||||
503,
|
||||
521
|
||||
],
|
||||
"size": {
|
||||
"0": 463.01251220703125,
|
||||
"1": 98.10446166992188
|
||||
},
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "clip",
|
||||
"type": "CLIP",
|
||||
"link": 56
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "conditioning",
|
||||
"type": "CONDITIONING",
|
||||
"links": [
|
||||
57
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 0
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CogVideoTextEncode"
|
||||
},
|
||||
"widgets_values": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"type": "DownloadAndLoadCogVideoModel",
|
||||
"pos": [
|
||||
649,
|
||||
182
|
||||
],
|
||||
"size": {
|
||||
"0": 315,
|
||||
"1": 58
|
||||
},
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "cogvideo_pipe",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"links": [
|
||||
36
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 0
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "DownloadAndLoadCogVideoModel"
|
||||
},
|
||||
"widgets_values": [
|
||||
"bf16"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "CogVideoDecode",
|
||||
"pos": [
|
||||
1140,
|
||||
783
|
||||
],
|
||||
"size": {
|
||||
"0": 210,
|
||||
"1": 46
|
||||
},
|
||||
"flags": {},
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "pipeline",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"link": 37
|
||||
},
|
||||
{
|
||||
"name": "samples",
|
||||
"type": "LATENT",
|
||||
"link": 38
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
59
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 0
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CogVideoDecode"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 33,
|
||||
"type": "GetImageSizeAndCount",
|
||||
"pos": [
|
||||
1189,
|
||||
134
|
||||
],
|
||||
"size": {
|
||||
"0": 210,
|
||||
"1": 86
|
||||
},
|
||||
"flags": {},
|
||||
"order": 6,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 59
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
60
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 0
|
||||
},
|
||||
{
|
||||
"name": "720 width",
|
||||
"type": "INT",
|
||||
"links": null,
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "480 height",
|
||||
"type": "INT",
|
||||
"links": null,
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "122 count",
|
||||
"type": "INT",
|
||||
"links": null,
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetImageSizeAndCount"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 22,
|
||||
"type": "CogVideoSampler",
|
||||
"pos": [
|
||||
1041,
|
||||
342
|
||||
],
|
||||
"size": {
|
||||
"0": 315,
|
||||
"1": 382
|
||||
},
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "pipeline",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"link": 36
|
||||
},
|
||||
{
|
||||
"name": "positive",
|
||||
"type": "CONDITIONING",
|
||||
"link": 55,
|
||||
"slot_index": 1
|
||||
},
|
||||
{
|
||||
"name": "negative",
|
||||
"type": "CONDITIONING",
|
||||
"link": 57
|
||||
},
|
||||
{
|
||||
"name": "samples",
|
||||
"type": "LATENT",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "cogvideo_pipe",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"links": [
|
||||
37
|
||||
],
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "samples",
|
||||
"type": "LATENT",
|
||||
"links": [
|
||||
38
|
||||
],
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CogVideoSampler"
|
||||
},
|
||||
"widgets_values": [
|
||||
480,
|
||||
720,
|
||||
128,
|
||||
8,
|
||||
25,
|
||||
6,
|
||||
806286757407563,
|
||||
"fixed",
|
||||
"DDIM",
|
||||
48,
|
||||
12,
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 32,
|
||||
"type": "VHS_VideoCombine",
|
||||
"pos": [
|
||||
1439,
|
||||
122
|
||||
],
|
||||
"size": [
|
||||
563.3333740234375,
|
||||
686.2222493489583
|
||||
],
|
||||
"flags": {},
|
||||
"order": 7,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"link": 60,
|
||||
"slot_index": 0
|
||||
},
|
||||
{
|
||||
"name": "audio",
|
||||
"type": "VHS_AUDIO",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "meta_batch",
|
||||
"type": "VHS_BatchManager",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "vae",
|
||||
"type": "VAE",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "Filenames",
|
||||
"type": "VHS_FILENAMES",
|
||||
"links": null,
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "VHS_VideoCombine"
|
||||
},
|
||||
"widgets_values": {
|
||||
"frame_rate": 8,
|
||||
"loop_count": 0,
|
||||
"filename_prefix": "AnimateDiff",
|
||||
"format": "video/h264-mp4",
|
||||
"pix_fmt": "yuv420p",
|
||||
"crf": 19,
|
||||
"save_metadata": true,
|
||||
"pingpong": false,
|
||||
"save_output": false,
|
||||
"videopreview": {
|
||||
"hidden": false,
|
||||
"paused": false,
|
||||
"params": {
|
||||
"filename": "AnimateDiff_00002.mp4",
|
||||
"subfolder": "",
|
||||
"type": "temp",
|
||||
"format": "video/h264-mp4",
|
||||
"frame_rate": 8
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"links": [
|
||||
[
|
||||
36,
|
||||
1,
|
||||
0,
|
||||
22,
|
||||
0,
|
||||
"COGVIDEOPIPE"
|
||||
],
|
||||
[
|
||||
37,
|
||||
22,
|
||||
0,
|
||||
11,
|
||||
0,
|
||||
"COGVIDEOPIPE"
|
||||
],
|
||||
[
|
||||
38,
|
||||
22,
|
||||
1,
|
||||
11,
|
||||
1,
|
||||
"LATENT"
|
||||
],
|
||||
[
|
||||
54,
|
||||
20,
|
||||
0,
|
||||
30,
|
||||
0,
|
||||
"CLIP"
|
||||
],
|
||||
[
|
||||
55,
|
||||
30,
|
||||
0,
|
||||
22,
|
||||
1,
|
||||
"CONDITIONING"
|
||||
],
|
||||
[
|
||||
56,
|
||||
20,
|
||||
0,
|
||||
31,
|
||||
0,
|
||||
"CLIP"
|
||||
],
|
||||
[
|
||||
57,
|
||||
31,
|
||||
0,
|
||||
22,
|
||||
2,
|
||||
"CONDITIONING"
|
||||
],
|
||||
[
|
||||
59,
|
||||
11,
|
||||
0,
|
||||
33,
|
||||
0,
|
||||
"IMAGE"
|
||||
],
|
||||
[
|
||||
60,
|
||||
33,
|
||||
0,
|
||||
32,
|
||||
0,
|
||||
"IMAGE"
|
||||
]
|
||||
],
|
||||
"groups": [],
|
||||
"config": {},
|
||||
"extra": {
|
||||
"ds": {
|
||||
"scale": 0.9090909090909091,
|
||||
"offset": [
|
||||
49.8551278885073,
|
||||
87.4070604693312
|
||||
]
|
||||
}
|
||||
},
|
||||
"version": 0.4
|
||||
}
|
||||
@ -1,6 +1,6 @@
|
||||
{
|
||||
"last_node_id": 69,
|
||||
"last_link_id": 176,
|
||||
"last_node_id": 70,
|
||||
"last_link_id": 181,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 20,
|
||||
@ -48,7 +48,7 @@
|
||||
"1": 86
|
||||
},
|
||||
"flags": {},
|
||||
"order": 13,
|
||||
"order": 12,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -81,7 +81,7 @@
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "25 count",
|
||||
"name": "26 count",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
121
|
||||
@ -166,47 +166,6 @@
|
||||
"bf16"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "CogVideoDecode",
|
||||
"pos": [
|
||||
1201,
|
||||
684
|
||||
],
|
||||
"size": {
|
||||
"0": 210,
|
||||
"1": 46
|
||||
},
|
||||
"flags": {},
|
||||
"order": 12,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "pipeline",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"link": 166
|
||||
},
|
||||
{
|
||||
"name": "samples",
|
||||
"type": "LATENT",
|
||||
"link": 167
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
118
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 0
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CogVideoDecode"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 41,
|
||||
"type": "ImageResizeKJ",
|
||||
@ -225,7 +184,7 @@
|
||||
{
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 128
|
||||
"link": 180
|
||||
},
|
||||
{
|
||||
"name": "get_image_size",
|
||||
@ -328,124 +287,6 @@
|
||||
"Node name for S&R": "CogVideoImageEncode"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 57,
|
||||
"type": "GetImageSizeAndCount",
|
||||
"pos": [
|
||||
603,
|
||||
-65
|
||||
],
|
||||
"size": [
|
||||
202.21431350127853,
|
||||
99.2360176040001
|
||||
],
|
||||
"flags": {},
|
||||
"order": 8,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 126,
|
||||
"slot_index": 0
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
129,
|
||||
136
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 0
|
||||
},
|
||||
{
|
||||
"name": "720 width",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
165
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 1
|
||||
},
|
||||
{
|
||||
"name": "480 height",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
164
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 2
|
||||
},
|
||||
{
|
||||
"name": "28 count",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
171,
|
||||
173
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetImageSizeAndCount"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 67,
|
||||
"type": "SimpleMath+",
|
||||
"pos": [
|
||||
665,
|
||||
98
|
||||
],
|
||||
"size": {
|
||||
"0": 315,
|
||||
"1": 78
|
||||
},
|
||||
"flags": {
|
||||
"collapsed": true
|
||||
},
|
||||
"order": 10,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "a",
|
||||
"type": "INT,FLOAT",
|
||||
"link": 173
|
||||
},
|
||||
{
|
||||
"name": "b",
|
||||
"type": "INT,FLOAT",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
174
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 0
|
||||
},
|
||||
{
|
||||
"name": "FLOAT",
|
||||
"type": "FLOAT",
|
||||
"links": null,
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "SimpleMath+"
|
||||
},
|
||||
"widgets_values": [
|
||||
"a - 4"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 59,
|
||||
"type": "GetImageRangeFromBatch",
|
||||
@ -460,7 +301,7 @@
|
||||
"flags": {
|
||||
"collapsed": true
|
||||
},
|
||||
"order": 15,
|
||||
"order": 14,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -520,7 +361,7 @@
|
||||
"1": 102
|
||||
},
|
||||
"flags": {},
|
||||
"order": 16,
|
||||
"order": 15,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -567,7 +408,7 @@
|
||||
"flags": {
|
||||
"collapsed": true
|
||||
},
|
||||
"order": 14,
|
||||
"order": 13,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -605,184 +446,6 @@
|
||||
"a - b"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 45,
|
||||
"type": "VHS_LoadVideo",
|
||||
"pos": [
|
||||
-93,
|
||||
-153
|
||||
],
|
||||
"size": [
|
||||
235.1999969482422,
|
||||
371.5999984741211
|
||||
],
|
||||
"flags": {},
|
||||
"order": 6,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "meta_batch",
|
||||
"type": "VHS_BatchManager",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "vae",
|
||||
"type": "VAE",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "frame_load_cap",
|
||||
"type": "INT",
|
||||
"link": 176,
|
||||
"widget": {
|
||||
"name": "frame_load_cap"
|
||||
}
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
128
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 0
|
||||
},
|
||||
{
|
||||
"name": "frame_count",
|
||||
"type": "INT",
|
||||
"links": null,
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "audio",
|
||||
"type": "VHS_AUDIO",
|
||||
"links": null,
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "video_info",
|
||||
"type": "VHS_VIDEOINFO",
|
||||
"links": null,
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "VHS_LoadVideo"
|
||||
},
|
||||
"widgets_values": {
|
||||
"video": "jeep.mp4",
|
||||
"force_rate": 0,
|
||||
"force_size": "Disabled",
|
||||
"custom_width": 512,
|
||||
"custom_height": 512,
|
||||
"frame_load_cap": 20,
|
||||
"skip_first_frames": 0,
|
||||
"select_every_nth": 1,
|
||||
"choose video to upload": "image",
|
||||
"videopreview": {
|
||||
"hidden": false,
|
||||
"paused": false,
|
||||
"params": {
|
||||
"frame_load_cap": 20,
|
||||
"skip_first_frames": 0,
|
||||
"force_rate": 0,
|
||||
"filename": "jeep.mp4",
|
||||
"type": "input",
|
||||
"format": "video/mp4",
|
||||
"select_every_nth": 1
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 68,
|
||||
"type": "SimpleMath+",
|
||||
"pos": [
|
||||
-75,
|
||||
-197
|
||||
],
|
||||
"size": {
|
||||
"0": 315,
|
||||
"1": 78
|
||||
},
|
||||
"flags": {
|
||||
"collapsed": true
|
||||
},
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "a",
|
||||
"type": "INT,FLOAT",
|
||||
"link": 175,
|
||||
"slot_index": 0
|
||||
},
|
||||
{
|
||||
"name": "b",
|
||||
"type": "INT,FLOAT",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
176
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 0
|
||||
},
|
||||
{
|
||||
"name": "FLOAT",
|
||||
"type": "FLOAT",
|
||||
"links": null,
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "SimpleMath+"
|
||||
},
|
||||
"widgets_values": [
|
||||
"a + 4"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 69,
|
||||
"type": "INTConstant",
|
||||
"pos": [
|
||||
-90,
|
||||
-305
|
||||
],
|
||||
"size": [
|
||||
200,
|
||||
58
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
175
|
||||
],
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"title": "Frames to load",
|
||||
"properties": {
|
||||
"Node name for S&R": "INTConstant"
|
||||
},
|
||||
"widgets_values": [
|
||||
24
|
||||
],
|
||||
"color": "#1b4669",
|
||||
"bgcolor": "#29699c"
|
||||
},
|
||||
{
|
||||
"id": 47,
|
||||
"type": "VHS_VideoCombine",
|
||||
@ -795,7 +458,7 @@
|
||||
711.3333333333333
|
||||
],
|
||||
"flags": {},
|
||||
"order": 17,
|
||||
"order": 16,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -854,6 +517,47 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "CogVideoDecode",
|
||||
"pos": [
|
||||
1224,
|
||||
737
|
||||
],
|
||||
"size": {
|
||||
"0": 210,
|
||||
"1": 46
|
||||
},
|
||||
"flags": {},
|
||||
"order": 11,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "pipeline",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"link": 166
|
||||
},
|
||||
{
|
||||
"name": "samples",
|
||||
"type": "LATENT",
|
||||
"link": 167
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
118
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 0
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CogVideoDecode"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"type": "CogVideoTextEncode",
|
||||
@ -890,9 +594,259 @@
|
||||
"Node name for S&R": "CogVideoTextEncode"
|
||||
},
|
||||
"widgets_values": [
|
||||
"A high-definition nature video showcasing a vibrant red panda as it gracefully runs down a crystal-clear stream, surrounded by the serene ambiance of a dense, verdant forest. The sunlight filters through the canopy of tall trees, casting dappled light on the forest floor, while the gentle sound of flowing water and rustling leaves creates a peaceful atmosphere. The red panda’s fur glistens in the sunlight, highlighting its striking red and white markings as it navigates the stream with agility and playfulness, occasionally pausing to drink from the water or look around curiously."
|
||||
"A high-definition nature video showcasing a brown bear as it gracefully runs down a crystal-clear stream, surrounded by the serene ambiance of a dense, verdant forest. The sunlight filters through the canopy of tall trees, casting dappled light on the forest floor, while the gentle sound of flowing water and rustling leaves creates a peaceful atmosphere. The brown bear's fur glistens in the sunlight, highlighting its striking red and white markings as it navigates the stream with agility and playfulness."
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 57,
|
||||
"type": "GetImageSizeAndCount",
|
||||
"pos": [
|
||||
603,
|
||||
-65
|
||||
],
|
||||
"size": {
|
||||
"0": 202.2143096923828,
|
||||
"1": 99.23601531982422
|
||||
},
|
||||
"flags": {},
|
||||
"order": 8,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 126,
|
||||
"slot_index": 0
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
129,
|
||||
136
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 0
|
||||
},
|
||||
{
|
||||
"name": "720 width",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
165
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 1
|
||||
},
|
||||
{
|
||||
"name": "480 height",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
164
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 2
|
||||
},
|
||||
{
|
||||
"name": "32 count",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
171,
|
||||
178,
|
||||
181
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetImageSizeAndCount"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 45,
|
||||
"type": "VHS_LoadVideo",
|
||||
"pos": [
|
||||
-93,
|
||||
-153
|
||||
],
|
||||
"size": [
|
||||
235.1999969482422,
|
||||
359.5999984741211
|
||||
],
|
||||
"flags": {},
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "meta_batch",
|
||||
"type": "VHS_BatchManager",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "vae",
|
||||
"type": "VAE",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "frame_load_cap",
|
||||
"type": "INT",
|
||||
"link": 177,
|
||||
"widget": {
|
||||
"name": "frame_load_cap"
|
||||
}
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
179
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 0
|
||||
},
|
||||
{
|
||||
"name": "frame_count",
|
||||
"type": "INT",
|
||||
"links": null,
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "audio",
|
||||
"type": "VHS_AUDIO",
|
||||
"links": null,
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "video_info",
|
||||
"type": "VHS_VIDEOINFO",
|
||||
"links": null,
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "VHS_LoadVideo"
|
||||
},
|
||||
"widgets_values": {
|
||||
"video": "jeep.mp4",
|
||||
"force_rate": 0,
|
||||
"force_size": "Disabled",
|
||||
"custom_width": 512,
|
||||
"custom_height": 512,
|
||||
"frame_load_cap": 20,
|
||||
"skip_first_frames": 0,
|
||||
"select_every_nth": 1,
|
||||
"choose video to upload": "image",
|
||||
"videopreview": {
|
||||
"hidden": false,
|
||||
"paused": false,
|
||||
"params": {
|
||||
"frame_load_cap": 20,
|
||||
"skip_first_frames": 0,
|
||||
"force_rate": 0,
|
||||
"filename": "jeep.mp4",
|
||||
"type": "input",
|
||||
"format": "video/mp4",
|
||||
"select_every_nth": 1
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 70,
|
||||
"type": "GetImageSizeAndCount",
|
||||
"pos": [
|
||||
214,
|
||||
-234
|
||||
],
|
||||
"size": {
|
||||
"0": 202.2143096923828,
|
||||
"1": 99.23601531982422
|
||||
},
|
||||
"flags": {},
|
||||
"order": 6,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 179,
|
||||
"slot_index": 0
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
180
|
||||
],
|
||||
"shape": 3,
|
||||
"slot_index": 0
|
||||
},
|
||||
{
|
||||
"name": "512 width",
|
||||
"type": "INT",
|
||||
"links": [],
|
||||
"shape": 3,
|
||||
"slot_index": 1
|
||||
},
|
||||
{
|
||||
"name": "256 height",
|
||||
"type": "INT",
|
||||
"links": [],
|
||||
"shape": 3,
|
||||
"slot_index": 2
|
||||
},
|
||||
{
|
||||
"name": "32 count",
|
||||
"type": "INT",
|
||||
"links": [],
|
||||
"shape": 3,
|
||||
"slot_index": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetImageSizeAndCount"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 69,
|
||||
"type": "INTConstant",
|
||||
"pos": [
|
||||
-90,
|
||||
-305
|
||||
],
|
||||
"size": {
|
||||
"0": 210,
|
||||
"1": 58
|
||||
},
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
177
|
||||
],
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"title": "Frames to load",
|
||||
"properties": {
|
||||
"Node name for S&R": "INTConstant"
|
||||
},
|
||||
"widgets_values": [
|
||||
32
|
||||
],
|
||||
"color": "#1b4669",
|
||||
"bgcolor": "#29699c"
|
||||
},
|
||||
{
|
||||
"id": 64,
|
||||
"type": "CogVideoSampler",
|
||||
@ -902,10 +856,10 @@
|
||||
],
|
||||
"size": [
|
||||
315,
|
||||
342
|
||||
370
|
||||
],
|
||||
"flags": {},
|
||||
"order": 11,
|
||||
"order": 10,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -947,10 +901,19 @@
|
||||
{
|
||||
"name": "num_frames",
|
||||
"type": "INT",
|
||||
"link": 174,
|
||||
"link": 178,
|
||||
"widget": {
|
||||
"name": "num_frames"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "t_tile_length",
|
||||
"type": "INT",
|
||||
"link": 181,
|
||||
"widget": {
|
||||
"name": "t_tile_length"
|
||||
},
|
||||
"slot_index": 7
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
@ -979,12 +942,14 @@
|
||||
720,
|
||||
16,
|
||||
8,
|
||||
50,
|
||||
25,
|
||||
9,
|
||||
12,
|
||||
13,
|
||||
"fixed",
|
||||
"DPM",
|
||||
0.81
|
||||
"DDIM",
|
||||
32,
|
||||
2,
|
||||
0.8
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -1037,14 +1002,6 @@
|
||||
0,
|
||||
"IMAGE"
|
||||
],
|
||||
[
|
||||
128,
|
||||
45,
|
||||
0,
|
||||
41,
|
||||
0,
|
||||
"IMAGE"
|
||||
],
|
||||
[
|
||||
129,
|
||||
57,
|
||||
@ -1166,35 +1123,43 @@
|
||||
"LATENT"
|
||||
],
|
||||
[
|
||||
173,
|
||||
57,
|
||||
3,
|
||||
67,
|
||||
177,
|
||||
69,
|
||||
0,
|
||||
"INT,FLOAT"
|
||||
45,
|
||||
2,
|
||||
"INT"
|
||||
],
|
||||
[
|
||||
174,
|
||||
67,
|
||||
0,
|
||||
178,
|
||||
57,
|
||||
3,
|
||||
64,
|
||||
6,
|
||||
"INT"
|
||||
],
|
||||
[
|
||||
175,
|
||||
69,
|
||||
179,
|
||||
45,
|
||||
0,
|
||||
68,
|
||||
70,
|
||||
0,
|
||||
"INT,FLOAT"
|
||||
"IMAGE"
|
||||
],
|
||||
[
|
||||
176,
|
||||
68,
|
||||
180,
|
||||
70,
|
||||
0,
|
||||
45,
|
||||
2,
|
||||
41,
|
||||
0,
|
||||
"IMAGE"
|
||||
],
|
||||
[
|
||||
181,
|
||||
57,
|
||||
3,
|
||||
64,
|
||||
7,
|
||||
"INT"
|
||||
]
|
||||
],
|
||||
@ -1204,8 +1169,8 @@
|
||||
"ds": {
|
||||
"scale": 0.7513148009015777,
|
||||
"offset": [
|
||||
281.39770788130244,
|
||||
559.6153930987157
|
||||
177.74090581831425,
|
||||
461.56507330501444
|
||||
]
|
||||
}
|
||||
},
|
||||
|
||||
@ -2,77 +2,6 @@
|
||||
"last_node_id": 31,
|
||||
"last_link_id": 57,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 22,
|
||||
"type": "CogVideoSampler",
|
||||
"pos": [
|
||||
1041,
|
||||
342
|
||||
],
|
||||
"size": {
|
||||
"0": 315,
|
||||
"1": 334
|
||||
},
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "pipeline",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"link": 36
|
||||
},
|
||||
{
|
||||
"name": "positive",
|
||||
"type": "CONDITIONING",
|
||||
"link": 55,
|
||||
"slot_index": 1
|
||||
},
|
||||
{
|
||||
"name": "negative",
|
||||
"type": "CONDITIONING",
|
||||
"link": 57
|
||||
},
|
||||
{
|
||||
"name": "samples",
|
||||
"type": "LATENT",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "cogvideo_pipe",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"links": [
|
||||
37
|
||||
],
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "samples",
|
||||
"type": "LATENT",
|
||||
"links": [
|
||||
38
|
||||
],
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CogVideoSampler"
|
||||
},
|
||||
"widgets_values": [
|
||||
480,
|
||||
720,
|
||||
16,
|
||||
8,
|
||||
25,
|
||||
6,
|
||||
806286757407561,
|
||||
"fixed",
|
||||
"DDIM",
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 28,
|
||||
"type": "VHS_VideoCombine",
|
||||
@ -82,7 +11,7 @@
|
||||
],
|
||||
"size": [
|
||||
667.752197265625,
|
||||
755.8347981770833
|
||||
310
|
||||
],
|
||||
"flags": {},
|
||||
"order": 6,
|
||||
@ -292,8 +221,8 @@
|
||||
"id": 11,
|
||||
"type": "CogVideoDecode",
|
||||
"pos": [
|
||||
1138,
|
||||
725
|
||||
1140,
|
||||
783
|
||||
],
|
||||
"size": {
|
||||
"0": 210,
|
||||
@ -328,6 +257,79 @@
|
||||
"properties": {
|
||||
"Node name for S&R": "CogVideoDecode"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 22,
|
||||
"type": "CogVideoSampler",
|
||||
"pos": [
|
||||
1041,
|
||||
342
|
||||
],
|
||||
"size": {
|
||||
"0": 315,
|
||||
"1": 382
|
||||
},
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "pipeline",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"link": 36
|
||||
},
|
||||
{
|
||||
"name": "positive",
|
||||
"type": "CONDITIONING",
|
||||
"link": 55,
|
||||
"slot_index": 1
|
||||
},
|
||||
{
|
||||
"name": "negative",
|
||||
"type": "CONDITIONING",
|
||||
"link": 57
|
||||
},
|
||||
{
|
||||
"name": "samples",
|
||||
"type": "LATENT",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "cogvideo_pipe",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"links": [
|
||||
37
|
||||
],
|
||||
"shape": 3
|
||||
},
|
||||
{
|
||||
"name": "samples",
|
||||
"type": "LATENT",
|
||||
"links": [
|
||||
38
|
||||
],
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CogVideoSampler"
|
||||
},
|
||||
"widgets_values": [
|
||||
480,
|
||||
720,
|
||||
16,
|
||||
8,
|
||||
25,
|
||||
6,
|
||||
806286757407561,
|
||||
"fixed",
|
||||
"DDIM",
|
||||
16,
|
||||
2,
|
||||
1
|
||||
]
|
||||
}
|
||||
],
|
||||
"links": [
|
||||
@ -400,10 +402,10 @@
|
||||
"config": {},
|
||||
"extra": {
|
||||
"ds": {
|
||||
"scale": 0.9090909090909092,
|
||||
"scale": 0.8264462809917356,
|
||||
"offset": [
|
||||
12.99028921497383,
|
||||
38.21608107136124
|
||||
253.92700064075518,
|
||||
186.82608107136124
|
||||
]
|
||||
}
|
||||
},
|
||||
|
||||
27
nodes.py
27
nodes.py
@ -153,17 +153,17 @@ class CogVideoImageEncode:
|
||||
vae = pipeline["pipe"].vae
|
||||
vae.to(device)
|
||||
|
||||
image = image * 2.0 - 1.0
|
||||
image = image.to(vae.dtype).to(device)
|
||||
image = image.unsqueeze(0).permute(0, 4, 1, 2, 3) # B, C, T, H, W
|
||||
B, C, T, H, W = image.shape
|
||||
input_image = image.clone() * 2.0 - 1.0
|
||||
input_image = input_image.to(vae.dtype).to(device)
|
||||
input_image = input_image.unsqueeze(0).permute(0, 4, 1, 2, 3) # B, C, T, H, W
|
||||
B, C, T, H, W = input_image.shape
|
||||
chunk_size = 16
|
||||
latents_list = []
|
||||
# Loop through the temporal dimension in chunks of 16
|
||||
for i in range(0, T, chunk_size):
|
||||
# Get the chunk of 16 frames (or remaining frames if less than 16 are left)
|
||||
end_index = min(i + chunk_size, T)
|
||||
image_chunk = image[:, :, i:end_index, :, :] # Shape: [B, C, chunk_size, H, W]
|
||||
image_chunk = input_image[:, :, i:end_index, :, :] # Shape: [B, C, chunk_size, H, W]
|
||||
|
||||
# Encode the chunk of images
|
||||
latents = vae.encode(image_chunk)
|
||||
@ -179,6 +179,7 @@ class CogVideoImageEncode:
|
||||
latents = vae.config.scaling_factor * latents
|
||||
latents = latents.permute(0, 2, 1, 3, 4) # B, T_chunk, C, H, W
|
||||
latents_list.append(latents)
|
||||
vae.clear_fake_context_parallel_cache()
|
||||
|
||||
# Concatenate all the chunks along the temporal dimension
|
||||
final_latents = torch.cat(latents_list, dim=1)
|
||||
@ -198,12 +199,14 @@ class CogVideoSampler:
|
||||
"negative": ("CONDITIONING", ),
|
||||
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
|
||||
"width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
|
||||
"num_frames": ("INT", {"default": 48, "min": 8, "max": 100, "step": 8}),
|
||||
"num_frames": ("INT", {"default": 48, "min": 8, "max": 1024, "step": 8}),
|
||||
"fps": ("INT", {"default": 8, "min": 1, "max": 100, "step": 1}),
|
||||
"steps": ("INT", {"default": 25, "min": 1}),
|
||||
"cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
|
||||
"seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
|
||||
"scheduler": (["DDIM", "DPM"],),
|
||||
"t_tile_length": ("INT", {"default": 16, "min": 16, "max": 128, "step": 4}),
|
||||
"t_tile_overlap": ("INT", {"default": 8, "min": 8, "max": 128, "step": 2}),
|
||||
},
|
||||
"optional": {
|
||||
"samples": ("LATENT", ),
|
||||
@ -216,14 +219,20 @@ class CogVideoSampler:
|
||||
FUNCTION = "process"
|
||||
CATEGORY = "CogVideoWrapper"
|
||||
|
||||
def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, samples=None, denoise_strength=1.0):
|
||||
def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, t_tile_length, t_tile_overlap, samples=None, denoise_strength=1.0):
|
||||
mm.soft_empty_cache()
|
||||
|
||||
assert t_tile_length > t_tile_overlap, "t_tile_length must be greater than t_tile_overlap"
|
||||
assert t_tile_length <= num_frames, "t_tile_length must be equal or less than num_frames"
|
||||
t_tile_length = t_tile_length // 4
|
||||
t_tile_overlap = t_tile_overlap // 4
|
||||
|
||||
device = mm.get_torch_device()
|
||||
offload_device = mm.unet_offload_device()
|
||||
pipe = pipeline["pipe"]
|
||||
dtype = pipeline["dtype"]
|
||||
base_path = pipeline["base_path"]
|
||||
|
||||
|
||||
pipe.transformer.to(device)
|
||||
generator = torch.Generator(device=device).manual_seed(seed)
|
||||
|
||||
@ -237,6 +246,8 @@ class CogVideoSampler:
|
||||
height = height,
|
||||
width = width,
|
||||
num_frames = num_frames,
|
||||
t_tile_length = t_tile_length,
|
||||
t_tile_overlap = t_tile_overlap,
|
||||
fps = fps,
|
||||
guidance_scale=cfg,
|
||||
latents=samples["samples"] if samples is not None else None,
|
||||
|
||||
@ -218,6 +218,16 @@ class CogVideoXPipeline(DiffusionPipeline):
|
||||
self.scheduler.set_begin_index(t_start * self.scheduler.order)
|
||||
|
||||
return timesteps.to(device), num_inference_steps - t_start
|
||||
|
||||
def _gaussian_weights(self, t_tile_length, t_batch_size):
    """Build Gaussian blending weights along the temporal tile axis.

    Each index of a tile gets a weight drawn from a Gaussian bump centred
    on the middle of the tile (variance 0.01 after the index offset is
    normalised by ``t_tile_length``), so values near the tile centre
    weigh more than values at its edges.

    Args:
        t_tile_length: Number of temporal positions in one tile.
        t_batch_size: Repeat count applied along dimension 1 of the
            result.

    Returns:
        A tensor of shape ``(1, t_tile_length * t_batch_size, 1, 1, 1)``
        holding the weights.
    """
    from numpy import pi, exp, sqrt

    var = 0.01
    # Centre of the tile in index space; indices run 0 .. t_tile_length - 1.
    center = (t_tile_length - 1) / 2
    length_sq = t_tile_length * t_tile_length
    two_var = 2 * var
    norm = sqrt(2 * pi * var)

    profile = []
    for idx in range(t_tile_length):
        offset = idx - center
        profile.append(exp(-offset * offset / length_sq / two_var) / norm)

    tile_weights = torch.tensor(profile)
    # Add singleton dims around the temporal axis so the weights broadcast
    # against 5-D tensors, then repeat along dimension 1.
    tile_weights = tile_weights[None, :, None, None, None]
    return tile_weights.repeat(1, t_batch_size, 1, 1, 1)
|
||||
|
||||
@property
|
||||
def guidance_scale(self):
|
||||
@ -244,6 +254,8 @@ class CogVideoXPipeline(DiffusionPipeline):
|
||||
height: int = 480,
|
||||
width: int = 720,
|
||||
num_frames: int = 48,
|
||||
t_tile_length: int = 12,
|
||||
t_tile_overlap: int = 4,
|
||||
fps: int = 8,
|
||||
num_inference_steps: int = 50,
|
||||
timesteps: Optional[List[int]] = None,
|
||||
@ -301,9 +313,9 @@ class CogVideoXPipeline(DiffusionPipeline):
|
||||
argument.
|
||||
"""
|
||||
|
||||
assert (
|
||||
num_frames <= 48 and num_frames % fps == 0 and fps == 8
|
||||
), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX."
|
||||
#assert (
|
||||
# num_frames <= 48 and num_frames % fps == 0 and fps == 8
|
||||
#), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX."
|
||||
|
||||
height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
|
||||
width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
|
||||
@ -337,7 +349,10 @@ class CogVideoXPipeline(DiffusionPipeline):
|
||||
|
||||
# 5. Prepare latents.
|
||||
latent_channels = self.transformer.config.in_channels
|
||||
num_frames += 1
|
||||
|
||||
if latents is None and num_frames == t_tile_length:
|
||||
num_frames += 1
|
||||
|
||||
latents, timesteps = self.prepare_latents(
|
||||
batch_size * num_videos_per_prompt,
|
||||
latent_channels,
|
||||
@ -356,6 +371,9 @@ class CogVideoXPipeline(DiffusionPipeline):
|
||||
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
||||
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
||||
|
||||
t_tile_weights = self._gaussian_weights(t_tile_length=t_tile_length, t_batch_size=1).to(latents.device).to(latents.dtype)
|
||||
print("latents.shape", latents.shape)
|
||||
print("latents.device", latents.device)
|
||||
# 7. Denoising loop
|
||||
num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
|
||||
comfy_pbar = ProgressBar(num_inference_steps)
|
||||
@ -365,45 +383,90 @@ class CogVideoXPipeline(DiffusionPipeline):
|
||||
for i, t in enumerate(timesteps):
|
||||
if self.interrupt:
|
||||
continue
|
||||
|
||||
#temporal tiling code based on https://github.com/mayuelala/FollowYourEmoji/blob/main/models/video_pipeline.py
|
||||
# =====================================================
|
||||
grid_ts = 0
|
||||
cur_t = 0
|
||||
while cur_t < latents.shape[1]:
|
||||
cur_t = max(grid_ts * t_tile_length - t_tile_overlap * grid_ts, 0) + t_tile_length
|
||||
grid_ts += 1
|
||||
|
||||
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
|
||||
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
||||
all_t = latents.shape[1]
|
||||
latents_all_list = []
|
||||
# =====================================================
|
||||
|
||||
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
|
||||
timestep = t.expand(latent_model_input.shape[0])
|
||||
for t_i in range(grid_ts):
|
||||
if t_i < grid_ts - 1:
|
||||
ofs_t = max(t_i * t_tile_length - t_tile_overlap * t_i, 0)
|
||||
if t_i == grid_ts - 1:
|
||||
ofs_t = all_t - t_tile_length
|
||||
|
||||
# predict noise model_output
|
||||
noise_pred = self.transformer(
|
||||
hidden_states=latent_model_input,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
timestep=timestep,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
noise_pred = noise_pred.float()
|
||||
input_start_t = ofs_t
|
||||
input_end_t = ofs_t + t_tile_length
|
||||
|
||||
# perform guidance
|
||||
# self._guidance_scale = 1 + guidance_scale * (
|
||||
# (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
|
||||
# )
|
||||
# print(self._guidance_scale)
|
||||
if self.do_classifier_free_guidance:
|
||||
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
||||
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||
#latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
|
||||
#latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
if not isinstance(self.scheduler, CogVideoXDPMScheduler):
|
||||
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
|
||||
else:
|
||||
latents, old_pred_original_sample = self.scheduler.step(
|
||||
noise_pred,
|
||||
old_pred_original_sample,
|
||||
t,
|
||||
timesteps[i - 1] if i > 0 else None,
|
||||
latents,
|
||||
**extra_step_kwargs,
|
||||
latents_tile = latents[:, input_start_t:input_end_t,:, :, :]
|
||||
latent_model_input_tile = torch.cat([latents_tile] * 2) if do_classifier_free_guidance else latents_tile
|
||||
latent_model_input_tile = self.scheduler.scale_model_input(latent_model_input_tile, t)
|
||||
|
||||
#t_input = t[None].to(device)
|
||||
t_input = t.expand(latent_model_input_tile.shape[0]) # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
|
||||
|
||||
# predict noise model_output
|
||||
noise_pred = self.transformer(
|
||||
hidden_states=latent_model_input_tile,
|
||||
encoder_hidden_states=prompt_embeds,
|
||||
timestep=t_input,
|
||||
return_dict=False,
|
||||
)
|
||||
latents = latents.to(prompt_embeds.dtype)
|
||||
)[0]
|
||||
noise_pred = noise_pred.float()
|
||||
|
||||
if self.do_classifier_free_guidance:
|
||||
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
||||
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
if not isinstance(self.scheduler, CogVideoXDPMScheduler):
|
||||
latents_tile = self.scheduler.step(noise_pred, t, latents_tile, **extra_step_kwargs, return_dict=False)[0]
|
||||
else:
|
||||
raise NotImplementedError("DPM is not supported with temporal tiling")
|
||||
# else:
|
||||
# latents_tile, old_pred_original_sample = self.scheduler.step(
|
||||
# noise_pred,
|
||||
# old_pred_original_sample,
|
||||
# t,
|
||||
# t_input[t_i - 1] if t_i > 0 else None,
|
||||
# latents_tile,
|
||||
# **extra_step_kwargs,
|
||||
# return_dict=False,
|
||||
# )
|
||||
|
||||
latents_all_list.append(latents_tile)
|
||||
|
||||
# ==========================================
|
||||
latents_all = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype)
|
||||
contributors = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype)
|
||||
# Add each tile contribution to overall latents
|
||||
for t_i in range(grid_ts):
|
||||
if t_i < grid_ts - 1:
|
||||
ofs_t = max(t_i * t_tile_length - t_tile_overlap * t_i, 0)
|
||||
if t_i == grid_ts - 1:
|
||||
ofs_t = all_t - t_tile_length
|
||||
|
||||
input_start_t = ofs_t
|
||||
input_end_t = ofs_t + t_tile_length
|
||||
|
||||
latents_all[:, input_start_t:input_end_t,:, :, :] += latents_all_list[t_i] * t_tile_weights
|
||||
contributors[:, input_start_t:input_end_t,:, :, :] += t_tile_weights
|
||||
|
||||
latents_all /= contributors
|
||||
|
||||
latents = latents_all
|
||||
# ==========================================
|
||||
|
||||
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
progress_bar.update()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user