From bbfaee3adb0773888c008f595ad36ff981cac462 Mon Sep 17 00:00:00 2001 From: kijai <40791699+kijai@users.noreply.github.com> Date: Wed, 7 Aug 2024 17:46:04 +0300 Subject: [PATCH] temporal tiling for longer outputs --- examples/cogvideo_long_01.json | 475 ++++++++++++ .../cogvideo_vid2vid_test_example_01.json | 725 +++++++++--------- examples/example_01.json | 156 ++-- nodes.py | 27 +- pipeline_cogvideox.py | 137 +++- 5 files changed, 1018 insertions(+), 502 deletions(-) create mode 100644 examples/cogvideo_long_01.json diff --git a/examples/cogvideo_long_01.json b/examples/cogvideo_long_01.json new file mode 100644 index 0000000..fb2b920 --- /dev/null +++ b/examples/cogvideo_long_01.json @@ -0,0 +1,475 @@ +{ + "last_node_id": 33, + "last_link_id": 60, + "nodes": [ + { + "id": 30, + "type": "CogVideoTextEncode", + "pos": [ + 500, + 308 + ], + "size": { + "0": 474.8450012207031, + "1": 164.7423553466797 + }, + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 54 + } + ], + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [ + 55 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CogVideoTextEncode" + }, + "widgets_values": [ + "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature\nacoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters\nthrough the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The\nbackground includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical\nperformance." + ] + }, + { + "id": 20, + "type": "CLIPLoader", + "pos": [ + -59, + 397 + ], + "size": { + "0": 451.30548095703125, + "1": 82 + }, + "flags": {}, + "order": 0, + "mode": 0, + "outputs": [ + { + "name": "CLIP", + "type": "CLIP", + "links": [ + 54, + 56 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CLIPLoader" + }, + "widgets_values": [ + "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", + "sd3" + ] + }, + { + "id": 31, + "type": "CogVideoTextEncode", + "pos": [ + 503, + 521 + ], + "size": { + "0": 463.01251220703125, + "1": 98.10446166992188 + }, + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 56 + } + ], + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [ + 57 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CogVideoTextEncode" + }, + "widgets_values": [ + "" + ] + }, + { + "id": 1, + "type": "DownloadAndLoadCogVideoModel", + "pos": [ + 649, + 182 + ], + "size": { + "0": 315, + "1": 58 + }, + "flags": {}, + "order": 1, + "mode": 0, + "outputs": [ + { + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", + "links": [ + 36 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DownloadAndLoadCogVideoModel" + }, + "widgets_values": [ + "bf16" + ] + }, + { + "id": 11, + "type": "CogVideoDecode", + "pos": [ + 1140, + 783 + ], + "size": { + "0": 210, + "1": 46 + }, + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 37 + }, + { + "name": "samples", + "type": "LATENT", + "link": 38 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 59 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CogVideoDecode" + } + }, + { + "id": 33, + "type": "GetImageSizeAndCount", + "pos": [ + 1189, + 134 + ], + "size": { + "0": 210, + "1": 86 + }, + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 59 + } + ], + "outputs": [ + { + "name": "image", + "type": "IMAGE", + "links": [ + 60 + ], + "shape": 3, + "slot_index": 0 + }, + { + "name": "720 width", + "type": "INT", + "links": null, + "shape": 3 + }, + { + "name": "480 height", + "type": "INT", + "links": null, + "shape": 3 + }, + { + "name": "122 count", + "type": "INT", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "GetImageSizeAndCount" + } + }, + { + "id": 22, + "type": "CogVideoSampler", + "pos": [ + 1041, + 342 + ], + "size": { + "0": 315, + "1": 382 + }, + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 36 + }, + { + "name": "positive", + "type": "CONDITIONING", + "link": 55, + "slot_index": 1 + }, + { + "name": "negative", + "type": "CONDITIONING", + "link": 57 + }, + { + "name": "samples", + "type": "LATENT", + "link": null + } + ], + "outputs": [ + { + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", + "links": [ + 37 + ], + "shape": 3 + }, + { + "name": "samples", + "type": "LATENT", + "links": [ + 38 + ], + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoSampler" + }, + "widgets_values": [ + 480, + 720, + 128, + 8, + 25, + 6, + 806286757407563, + "fixed", + "DDIM", + 48, + 12, + 1 + ] + }, + { + "id": 32, + "type": "VHS_VideoCombine", + "pos": [ + 1439, + 122 + ], + "size": [ + 563.3333740234375, + 686.2222493489583 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 60, + "slot_index": 0 + }, + { + "name": "audio", + "type": "VHS_AUDIO", + "link": null + }, + { + "name": "meta_batch", + "type": "VHS_BatchManager", + "link": null + }, + { + "name": "vae", + "type": "VAE", + "link": null + } + ], + "outputs": [ + { + "name": "Filenames", + "type": "VHS_FILENAMES", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "VHS_VideoCombine" + }, + "widgets_values": { + "frame_rate": 8, + "loop_count": 0, + "filename_prefix": "AnimateDiff", + "format": "video/h264-mp4", + "pix_fmt": "yuv420p", + "crf": 19, + "save_metadata": true, + "pingpong": false, + "save_output": false, + "videopreview": { + "hidden": false, + "paused": false, + "params": { + "filename": "AnimateDiff_00002.mp4", + "subfolder": "", + "type": "temp", + "format": "video/h264-mp4", + "frame_rate": 8 + } + } + } + } + ], + "links": [ + [ + 36, + 1, + 0, + 22, + 0, + "COGVIDEOPIPE" + ], + [ + 37, + 22, + 0, + 11, + 0, + "COGVIDEOPIPE" + ], + [ + 38, + 22, + 1, + 11, + 1, + "LATENT" + ], + [ + 54, + 20, + 0, + 30, + 0, + "CLIP" + ], + [ + 55, + 30, + 0, + 22, + 1, + "CONDITIONING" + ], + [ + 56, + 20, + 0, + 31, + 0, + "CLIP" + ], + [ + 57, + 31, + 0, + 22, + 2, + "CONDITIONING" + ], + [ + 59, + 11, + 0, + 33, + 0, + "IMAGE" + ], + [ + 60, + 33, + 0, + 32, + 0, + "IMAGE" + ] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.9090909090909091, + "offset": [ + 49.8551278885073, + 87.4070604693312 + ] + } + }, + "version": 0.4 +} \ No newline at end of file diff --git a/examples/cogvideo_vid2vid_test_example_01.json b/examples/cogvideo_vid2vid_test_example_01.json index 2416c9d..1746fda 100644 --- a/examples/cogvideo_vid2vid_test_example_01.json +++ b/examples/cogvideo_vid2vid_test_example_01.json @@ -1,6 +1,6 @@ { - "last_node_id": 69, - "last_link_id": 176, + "last_node_id": 70, + "last_link_id": 181, "nodes": [ { "id": 20, @@ -48,7 +48,7 @@ "1": 86 }, "flags": {}, - "order": 13, + "order": 12, "mode": 0, "inputs": [ { @@ -81,7 +81,7 @@ "shape": 3 }, { - "name": "25 count", + "name": "26 count", "type": "INT", "links": [ 121 @@ -166,47 +166,6 @@ "bf16" ] }, - { - "id": 11, - "type": "CogVideoDecode", - "pos": [ - 1201, - 684 - ], - "size": { - "0": 210, - "1": 46 - }, - "flags": {}, - "order": 12, - "mode": 0, - "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 166 - }, - { - "name": "samples", - "type": "LATENT", - "link": 167 - } - ], - "outputs": [ - { - "name": "images", - "type": "IMAGE", - "links": [ - 118 - ], - "shape": 3, - "slot_index": 0 - } - ], - "properties": { - "Node name for S&R": "CogVideoDecode" - } - }, { "id": 41, "type": "ImageResizeKJ", @@ -225,7 +184,7 @@ { "name": "image", "type": "IMAGE", - "link": 128 + "link": 180 }, { "name": "get_image_size", @@ -328,124 +287,6 @@ "Node name for S&R": "CogVideoImageEncode" } }, - { - "id": 57, - "type": "GetImageSizeAndCount", - "pos": [ - 603, - -65 - ], - "size": [ - 202.21431350127853, - 99.2360176040001 - ], - "flags": {}, - "order": 8, - "mode": 0, - "inputs": [ - { - "name": "image", - "type": "IMAGE", - "link": 126, - "slot_index": 0 - } - ], - "outputs": [ - { - "name": "image", - "type": "IMAGE", - "links": [ - 129, - 136 - ], - "shape": 3, - "slot_index": 0 - }, - { - "name": "720 width", - "type": "INT", - "links": [ - 165 - ], - "shape": 3, - "slot_index": 1 - }, - { - "name": "480 height", - "type": "INT", - "links": [ - 164 - ], - "shape": 3, - "slot_index": 2 - }, - { - "name": "28 count", - "type": "INT", - "links": [ - 171, - 173 - ], - "shape": 3, - "slot_index": 3 - } - ], - "properties": { - "Node name for S&R": "GetImageSizeAndCount" - } - }, - { - "id": 67, - "type": "SimpleMath+", - "pos": [ - 665, - 98 - ], - "size": { - "0": 315, - "1": 78 - }, - "flags": { - "collapsed": true - }, - "order": 10, - "mode": 0, - "inputs": [ - { - "name": "a", - "type": "INT,FLOAT", - "link": 173 - }, - { - "name": "b", - "type": "INT,FLOAT", - "link": null - } - ], - "outputs": [ - { - "name": "INT", - "type": "INT", - "links": [ - 174 - ], - "shape": 3, - "slot_index": 0 - }, - { - "name": "FLOAT", - "type": "FLOAT", - "links": null, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "SimpleMath+" - }, - "widgets_values": [ - "a - 4" - ] - }, { "id": 59, "type": "GetImageRangeFromBatch", @@ -460,7 +301,7 @@ "flags": { "collapsed": true }, - "order": 15, + "order": 14, "mode": 0, "inputs": [ { @@ -520,7 +361,7 @@ "1": 102 }, "flags": {}, - "order": 16, + "order": 15, "mode": 0, "inputs": [ { @@ -567,7 +408,7 @@ "flags": { "collapsed": true }, - "order": 14, + "order": 13, "mode": 0, "inputs": [ { @@ -605,184 +446,6 @@ "a - b" ] }, - { - "id": 45, - "type": "VHS_LoadVideo", - "pos": [ - -93, - -153 - ], - "size": [ - 235.1999969482422, - 371.5999984741211 - ], - "flags": {}, - "order": 6, - "mode": 0, - "inputs": [ - { - "name": "meta_batch", - "type": "VHS_BatchManager", - "link": null - }, - { - "name": "vae", - "type": "VAE", - "link": null - }, - { - "name": "frame_load_cap", - "type": "INT", - "link": 176, - "widget": { - "name": "frame_load_cap" - } - } - ], - "outputs": [ - { - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 128 - ], - "shape": 3, - "slot_index": 0 - }, - { - "name": "frame_count", - "type": "INT", - "links": null, - "shape": 3 - }, - { - "name": "audio", - "type": "VHS_AUDIO", - "links": null, - "shape": 3 - }, - { - "name": "video_info", - "type": "VHS_VIDEOINFO", - "links": null, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "VHS_LoadVideo" - }, - "widgets_values": { - "video": "jeep.mp4", - "force_rate": 0, - "force_size": "Disabled", - "custom_width": 512, - "custom_height": 512, - "frame_load_cap": 20, - "skip_first_frames": 0, - "select_every_nth": 1, - "choose video to upload": "image", - "videopreview": { - "hidden": false, - "paused": false, - "params": { - "frame_load_cap": 20, - "skip_first_frames": 0, - "force_rate": 0, - "filename": "jeep.mp4", - "type": "input", - "format": "video/mp4", - "select_every_nth": 1 - } - } - } - }, - { - "id": 68, - "type": "SimpleMath+", - "pos": [ - -75, - -197 - ], - "size": { - "0": 315, - "1": 78 - }, - "flags": { - "collapsed": true - }, - "order": 5, - "mode": 0, - "inputs": [ - { - "name": "a", - "type": "INT,FLOAT", - "link": 175, - "slot_index": 0 - }, - { - "name": "b", - "type": "INT,FLOAT", - "link": null - } - ], - "outputs": [ - { - "name": "INT", - "type": "INT", - "links": [ - 176 - ], - "shape": 3, - "slot_index": 0 - }, - { - "name": "FLOAT", - "type": "FLOAT", - "links": null, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "SimpleMath+" - }, - "widgets_values": [ - "a + 4" - ] - }, - { - "id": 69, - "type": "INTConstant", - "pos": [ - -90, - -305 - ], - "size": [ - 200, - 58 - ], - "flags": {}, - "order": 2, - "mode": 0, - "outputs": [ - { - "name": "value", - "type": "INT", - "links": [ - 175 - ], - "shape": 3 - } - ], - "title": "Frames to load", - "properties": { - "Node name for S&R": "INTConstant" - }, - "widgets_values": [ - 24 - ], - "color": "#1b4669", - "bgcolor": "#29699c" - }, { "id": 47, "type": "VHS_VideoCombine", @@ -795,7 +458,7 @@ 711.3333333333333 ], "flags": {}, - "order": 17, + "order": 16, "mode": 0, "inputs": [ { @@ -854,6 +517,47 @@ } } }, + { + "id": 11, + "type": "CogVideoDecode", + "pos": [ + 1224, + 737 + ], + "size": { + "0": 210, + "1": 46 + }, + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 166 + }, + { + "name": "samples", + "type": "LATENT", + "link": 167 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 118 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CogVideoDecode" + } + }, { "id": 30, "type": "CogVideoTextEncode", @@ -890,9 +594,259 @@ "Node name for S&R": "CogVideoTextEncode" }, "widgets_values": [ - "A high-definition nature video showcasing a vibrant red panda as it gracefully runs down a crystal-clear stream, surrounded by the serene ambiance of a dense, verdant forest. The sunlight filters through the canopy of tall trees, casting dappled light on the forest floor, while the gentle sound of flowing water and rustling leaves creates a peaceful atmosphere. The red panda’s fur glistens in the sunlight, highlighting its striking red and white markings as it navigates the stream with agility and playfulness, occasionally pausing to drink from the water or look around curiously." + "A high-definition nature video showcasing a brown bear as it gracefully runs down a crystal-clear stream, surrounded by the serene ambiance of a dense, verdant forest. The sunlight filters through the canopy of tall trees, casting dappled light on the forest floor, while the gentle sound of flowing water and rustling leaves creates a peaceful atmosphere. The brown bear's fur glistens in the sunlight, highlighting its striking red and white markings as it navigates the stream with agility and playfulness." ] }, + { + "id": 57, + "type": "GetImageSizeAndCount", + "pos": [ + 603, + -65 + ], + "size": { + "0": 202.2143096923828, + "1": 99.23601531982422 + }, + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 126, + "slot_index": 0 + } + ], + "outputs": [ + { + "name": "image", + "type": "IMAGE", + "links": [ + 129, + 136 + ], + "shape": 3, + "slot_index": 0 + }, + { + "name": "720 width", + "type": "INT", + "links": [ + 165 + ], + "shape": 3, + "slot_index": 1 + }, + { + "name": "480 height", + "type": "INT", + "links": [ + 164 + ], + "shape": 3, + "slot_index": 2 + }, + { + "name": "32 count", + "type": "INT", + "links": [ + 171, + 178, + 181 + ], + "shape": 3, + "slot_index": 3 + } + ], + "properties": { + "Node name for S&R": "GetImageSizeAndCount" + } + }, + { + "id": 45, + "type": "VHS_LoadVideo", + "pos": [ + -93, + -153 + ], + "size": [ + 235.1999969482422, + 359.5999984741211 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "meta_batch", + "type": "VHS_BatchManager", + "link": null + }, + { + "name": "vae", + "type": "VAE", + "link": null + }, + { + "name": "frame_load_cap", + "type": "INT", + "link": 177, + "widget": { + "name": "frame_load_cap" + } + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 179 + ], + "shape": 3, + "slot_index": 0 + }, + { + "name": "frame_count", + "type": "INT", + "links": null, + "shape": 3 + }, + { + "name": "audio", + "type": "VHS_AUDIO", + "links": null, + "shape": 3 + }, + { + "name": "video_info", + "type": "VHS_VIDEOINFO", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "VHS_LoadVideo" + }, + "widgets_values": { + "video": "jeep.mp4", + "force_rate": 0, + "force_size": "Disabled", + "custom_width": 512, + "custom_height": 512, + "frame_load_cap": 20, + "skip_first_frames": 0, + "select_every_nth": 1, + "choose video to upload": "image", + "videopreview": { + "hidden": false, + "paused": false, + "params": { + "frame_load_cap": 20, + "skip_first_frames": 0, + "force_rate": 0, + "filename": "jeep.mp4", + "type": "input", + "format": "video/mp4", + "select_every_nth": 1 + } + } + } + }, + { + "id": 70, + "type": "GetImageSizeAndCount", + "pos": [ + 214, + -234 + ], + "size": { + "0": 202.2143096923828, + "1": 99.23601531982422 + }, + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 179, + "slot_index": 0 + } + ], + "outputs": [ + { + "name": "image", + "type": "IMAGE", + "links": [ + 180 + ], + "shape": 3, + "slot_index": 0 + }, + { + "name": "512 width", + "type": "INT", + "links": [], + "shape": 3, + "slot_index": 1 + }, + { + "name": "256 height", + "type": "INT", + "links": [], + "shape": 3, + "slot_index": 2 + }, + { + "name": "32 count", + "type": "INT", + "links": [], + "shape": 3, + "slot_index": 3 + } + ], + "properties": { + "Node name for S&R": "GetImageSizeAndCount" + } + }, + { + "id": 69, + "type": "INTConstant", + "pos": [ + -90, + -305 + ], + "size": { + "0": 210, + "1": 58 + }, + "flags": {}, + "order": 2, + "mode": 0, + "outputs": [ + { + "name": "value", + "type": "INT", + "links": [ + 177 + ], + "shape": 3 + } + ], + "title": "Frames to load", + "properties": { + "Node name for S&R": "INTConstant" + }, + "widgets_values": [ + 32 + ], + "color": "#1b4669", + "bgcolor": "#29699c" + }, { "id": 64, "type": "CogVideoSampler", @@ -902,10 +856,10 @@ ], "size": [ 315, - 342 + 370 ], "flags": {}, - "order": 11, + "order": 10, "mode": 0, "inputs": [ { @@ -947,10 +901,19 @@ { "name": "num_frames", "type": "INT", - "link": 174, + "link": 178, "widget": { "name": "num_frames" } + }, + { + "name": "t_tile_length", + "type": "INT", + "link": 181, + "widget": { + "name": "t_tile_length" + }, + "slot_index": 7 } ], "outputs": [ @@ -979,12 +942,14 @@ 720, 16, 8, - 50, + 25, 9, - 12, + 13, "fixed", - "DPM", - 0.81 + "DDIM", + 32, + 2, + 0.8 ] } ], @@ -1037,14 +1002,6 @@ 0, "IMAGE" ], - [ - 128, - 45, - 0, - 41, - 0, - "IMAGE" - ], [ 129, 57, @@ -1166,35 +1123,43 @@ "LATENT" ], [ - 173, - 57, - 3, - 67, + 177, + 69, 0, - "INT,FLOAT" + 45, + 2, + "INT" ], [ - 174, - 67, - 0, + 178, + 57, + 3, 64, 6, "INT" ], [ - 175, - 69, + 179, + 45, 0, - 68, + 70, 0, - "INT,FLOAT" + "IMAGE" ], [ - 176, - 68, + 180, + 70, 0, - 45, - 2, + 41, + 0, + "IMAGE" + ], + [ + 181, + 57, + 3, + 64, + 7, "INT" ] ], @@ -1204,8 +1169,8 @@ "ds": { "scale": 0.7513148009015777, "offset": [ - 281.39770788130244, - 559.6153930987157 + 177.74090581831425, + 461.56507330501444 ] } }, diff --git a/examples/example_01.json b/examples/example_01.json index 29a854f..a131b8c 100644 --- a/examples/example_01.json +++ b/examples/example_01.json @@ -2,77 +2,6 @@ "last_node_id": 31, "last_link_id": 57, "nodes": [ - { - "id": 22, - "type": "CogVideoSampler", - "pos": [ - 1041, - 342 - ], - "size": { - "0": 315, - "1": 334 - }, - "flags": {}, - "order": 4, - "mode": 0, - "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 36 - }, - { - "name": "positive", - "type": "CONDITIONING", - "link": 55, - "slot_index": 1 - }, - { - "name": "negative", - "type": "CONDITIONING", - "link": 57 - }, - { - "name": "samples", - "type": "LATENT", - "link": null - } - ], - "outputs": [ - { - "name": "cogvideo_pipe", - "type": "COGVIDEOPIPE", - "links": [ - 37 - ], - "shape": 3 - }, - { - "name": "samples", - "type": "LATENT", - "links": [ - 38 - ], - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "CogVideoSampler" - }, - "widgets_values": [ - 480, - 720, - 16, - 8, - 25, - 6, - 806286757407561, - "fixed", - "DDIM", - 1 - ] - }, { "id": 28, "type": "VHS_VideoCombine", @@ -82,7 +11,7 @@ ], "size": [ 667.752197265625, - 755.8347981770833 + 310 ], "flags": {}, "order": 6, @@ -292,8 +221,8 @@ "id": 11, "type": "CogVideoDecode", "pos": [ - 1138, - 725 + 1140, + 783 ], "size": { "0": 210, @@ -328,6 +257,79 @@ "properties": { "Node name for S&R": "CogVideoDecode" } + }, + { + "id": 22, + "type": "CogVideoSampler", + "pos": [ + 1041, + 342 + ], + "size": { + "0": 315, + "1": 382 + }, + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 36 + }, + { + "name": "positive", + "type": "CONDITIONING", + "link": 55, + "slot_index": 1 + }, + { + "name": "negative", + "type": "CONDITIONING", + "link": 57 + }, + { + "name": "samples", + "type": "LATENT", + "link": null + } + ], + "outputs": [ + { + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", + "links": [ + 37 + ], + "shape": 3 + }, + { + "name": "samples", + "type": "LATENT", + "links": [ + 38 + ], + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoSampler" + }, + "widgets_values": [ + 480, + 720, + 16, + 8, + 25, + 6, + 806286757407561, + "fixed", + "DDIM", + 16, + 2, + 1 + ] } ], "links": [ @@ -400,10 +402,10 @@ "config": {}, "extra": { "ds": { - "scale": 0.9090909090909092, + "scale": 0.8264462809917356, "offset": [ - 12.99028921497383, - 38.21608107136124 + 253.92700064075518, + 186.82608107136124 ] } }, diff --git a/nodes.py b/nodes.py index dc3bf73..7bd8186 100644 --- a/nodes.py +++ b/nodes.py @@ -153,17 +153,17 @@ class CogVideoImageEncode: vae = pipeline["pipe"].vae vae.to(device) - image = image * 2.0 - 1.0 - image = image.to(vae.dtype).to(device) - image = image.unsqueeze(0).permute(0, 4, 1, 2, 3) # B, C, T, H, W - B, C, T, H, W = image.shape + input_image = image.clone() * 2.0 - 1.0 + input_image = input_image.to(vae.dtype).to(device) + input_image = input_image.unsqueeze(0).permute(0, 4, 1, 2, 3) # B, C, T, H, W + B, C, T, H, W = input_image.shape chunk_size = 16 latents_list = [] # Loop through the temporal dimension in chunks of 16 for i in range(0, T, chunk_size): # Get the chunk of 16 frames (or remaining frames if less than 16 are left) end_index = min(i + chunk_size, T) - image_chunk = image[:, :, i:end_index, :, :] # Shape: [B, C, chunk_size, H, W] + image_chunk = input_image[:, :, i:end_index, :, :] # Shape: [B, C, chunk_size, H, W] # Encode the chunk of images latents = vae.encode(image_chunk) @@ -179,6 +179,7 @@ class CogVideoImageEncode: latents = vae.config.scaling_factor * latents latents = latents.permute(0, 2, 1, 3, 4) # B, T_chunk, C, H, W latents_list.append(latents) + vae.clear_fake_context_parallel_cache() # Concatenate all the chunks along the temporal dimension final_latents = torch.cat(latents_list, dim=1) @@ -198,12 +199,14 @@ class CogVideoSampler: "negative": ("CONDITIONING", ), "height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}), "width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}), - "num_frames": ("INT", {"default": 48, "min": 8, "max": 100, "step": 8}), + "num_frames": ("INT", {"default": 48, "min": 8, "max": 1024, "step": 8}), "fps": ("INT", {"default": 8, "min": 1, "max": 100, "step": 1}), "steps": ("INT", {"default": 25, "min": 1}), "cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}), "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}), "scheduler": (["DDIM", "DPM"],), + "t_tile_length": ("INT", {"default": 16, "min": 16, "max": 128, "step": 4}), + "t_tile_overlap": ("INT", {"default": 8, "min": 8, "max": 128, "step": 2}), }, "optional": { "samples": ("LATENT", ), @@ -216,14 +219,20 @@ class CogVideoSampler: FUNCTION = "process" CATEGORY = "CogVideoWrapper" - def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, samples=None, denoise_strength=1.0): + def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, t_tile_length, t_tile_overlap, samples=None, denoise_strength=1.0): mm.soft_empty_cache() + + assert t_tile_length > t_tile_overlap, "t_tile_length must be greater than t_tile_overlap" + assert t_tile_length <= num_frames, "t_tile_length must be equal or less than num_frames" + t_tile_length = t_tile_length // 4 + t_tile_overlap = t_tile_overlap // 4 + device = mm.get_torch_device() offload_device = mm.unet_offload_device() pipe = pipeline["pipe"] dtype = pipeline["dtype"] base_path = pipeline["base_path"] - + pipe.transformer.to(device) generator = torch.Generator(device=device).manual_seed(seed) @@ -237,6 +246,8 @@ class CogVideoSampler: height = height, width = width, num_frames = num_frames, + t_tile_length = t_tile_length, + t_tile_overlap = t_tile_overlap, fps = fps, guidance_scale=cfg, latents=samples["samples"] if samples is not None else None, diff --git a/pipeline_cogvideox.py b/pipeline_cogvideox.py index f880b0e..b36846a 100644 --- a/pipeline_cogvideox.py +++ b/pipeline_cogvideox.py @@ -218,6 +218,16 @@ class CogVideoXPipeline(DiffusionPipeline): self.scheduler.set_begin_index(t_start * self.scheduler.order) return timesteps.to(device), num_inference_steps - t_start + + def _gaussian_weights(self, t_tile_length, t_batch_size): + from numpy import pi, exp, sqrt + + var = 0.01 + midpoint = (t_tile_length - 1) / 2 # -1 because index goes from 0 to latent_width - 1 + t_probs = [exp(-(t-midpoint)*(t-midpoint)/(t_tile_length*t_tile_length)/(2*var)) / sqrt(2*pi*var) for t in range(t_tile_length)] + weights = torch.tensor(t_probs) + weights = weights.unsqueeze(0).unsqueeze(2).unsqueeze(3).unsqueeze(4).repeat(1, t_batch_size,1, 1, 1) + return weights @property def guidance_scale(self): @@ -244,6 +254,8 @@ class CogVideoXPipeline(DiffusionPipeline): height: int = 480, width: int = 720, num_frames: int = 48, + t_tile_length: int = 12, + t_tile_overlap: int = 4, fps: int = 8, num_inference_steps: int = 50, timesteps: Optional[List[int]] = None, @@ -301,9 +313,9 @@ class CogVideoXPipeline(DiffusionPipeline): argument. """ - assert ( - num_frames <= 48 and num_frames % fps == 0 and fps == 8 - ), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX." + #assert ( + # num_frames <= 48 and num_frames % fps == 0 and fps == 8 + #), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX." height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial @@ -337,7 +349,10 @@ class CogVideoXPipeline(DiffusionPipeline): # 5. Prepare latents. latent_channels = self.transformer.config.in_channels - num_frames += 1 + + if latents is None and num_frames == t_tile_length: + num_frames += 1 + latents, timesteps = self.prepare_latents( batch_size * num_videos_per_prompt, latent_channels, @@ -356,6 +371,9 @@ class CogVideoXPipeline(DiffusionPipeline): # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + t_tile_weights = self._gaussian_weights(t_tile_length=t_tile_length, t_batch_size=1).to(latents.device).to(latents.dtype) + print("latents.shape", latents.shape) + print("latents.device", latents.device) # 7. Denoising loop num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) comfy_pbar = ProgressBar(num_inference_steps) @@ -365,45 +383,90 @@ class CogVideoXPipeline(DiffusionPipeline): for i, t in enumerate(timesteps): if self.interrupt: continue + + #temporal tiling code based on https://github.com/mayuelala/FollowYourEmoji/blob/main/models/video_pipeline.py + # ===================================================== + grid_ts = 0 + cur_t = 0 + while cur_t < latents.shape[1]: + cur_t = max(grid_ts * t_tile_length - t_tile_overlap * grid_ts, 0) + t_tile_length + grid_ts += 1 - latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + all_t = latents.shape[1] + latents_all_list = [] + # ===================================================== - # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timestep = t.expand(latent_model_input.shape[0]) + for t_i in range(grid_ts): + if t_i < grid_ts - 1: + ofs_t = max(t_i * t_tile_length - t_tile_overlap * t_i, 0) + if t_i == grid_ts - 1: + ofs_t = all_t - t_tile_length - # predict noise model_output - noise_pred = self.transformer( - hidden_states=latent_model_input, - encoder_hidden_states=prompt_embeds, - timestep=timestep, - return_dict=False, - )[0] - noise_pred = noise_pred.float() + input_start_t = ofs_t + input_end_t = ofs_t + t_tile_length - # perform guidance - # self._guidance_scale = 1 + guidance_scale * ( - # (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2 - # ) - # print(self._guidance_scale) - if self.do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + #latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + #latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - # compute the previous noisy sample x_t -> x_t-1 - if not isinstance(self.scheduler, CogVideoXDPMScheduler): - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - else: - latents, old_pred_original_sample = self.scheduler.step( - noise_pred, - old_pred_original_sample, - t, - timesteps[i - 1] if i > 0 else None, - latents, - **extra_step_kwargs, + latents_tile = latents[:, input_start_t:input_end_t,:, :, :] + latent_model_input_tile = torch.cat([latents_tile] * 2) if do_classifier_free_guidance else latents_tile + latent_model_input_tile = self.scheduler.scale_model_input(latent_model_input_tile, t) + + #t_input = t[None].to(device) + t_input = t.expand(latent_model_input_tile.shape[0]) # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + + # predict noise model_output + noise_pred = self.transformer( + hidden_states=latent_model_input_tile, + encoder_hidden_states=prompt_embeds, + timestep=t_input, return_dict=False, - ) - latents = latents.to(prompt_embeds.dtype) + )[0] + noise_pred = noise_pred.float() + + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + if not isinstance(self.scheduler, CogVideoXDPMScheduler): + latents_tile = self.scheduler.step(noise_pred, t, latents_tile, **extra_step_kwargs, return_dict=False)[0] + else: + raise NotImplementedError("DPM is not supported with temporal tiling") + # else: + # latents_tile, old_pred_original_sample = self.scheduler.step( + # noise_pred, + # old_pred_original_sample, + # t, + # t_input[t_i - 1] if t_i > 0 else None, + # latents_tile, + # **extra_step_kwargs, + # return_dict=False, + # ) + + latents_all_list.append(latents_tile) + + # ========================================== + latents_all = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype) + contributors = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype) + # Add each tile contribution to overall latents + for t_i in range(grid_ts): + if t_i < grid_ts - 1: + ofs_t = max(t_i * t_tile_length - t_tile_overlap * t_i, 0) + if t_i == grid_ts - 1: + ofs_t = all_t - t_tile_length + + input_start_t = ofs_t + input_end_t = ofs_t + t_tile_length + + latents_all[:, input_start_t:input_end_t,:, :, :] += latents_all_list[t_i] * t_tile_weights + contributors[:, input_start_t:input_end_t,:, :, :] += t_tile_weights + + latents_all /= contributors + + latents = latents_all + # ========================================== + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update()