From 2ae70dd82eb133551ba43bcb98b3a5683d832238 Mon Sep 17 00:00:00 2001 From: kijai <40791699+kijai@users.noreply.github.com> Date: Wed, 7 Aug 2024 02:11:37 +0300 Subject: [PATCH] cleanup, fix vid2vid --- .../cogvideo_vid2vid_test_example_01.json | 1279 +++++++++-------- nodes.py | 7 +- pipeline_cogvideox.py | 117 +- 3 files changed, 700 insertions(+), 703 deletions(-) diff --git a/examples/cogvideo_vid2vid_test_example_01.json b/examples/cogvideo_vid2vid_test_example_01.json index 93b1532..fb4b6e5 100644 --- a/examples/cogvideo_vid2vid_test_example_01.json +++ b/examples/cogvideo_vid2vid_test_example_01.json @@ -1,40 +1,7 @@ { - "last_node_id": 64, - "last_link_id": 167, + "last_node_id": 69, + "last_link_id": 176, "nodes": [ - { - "id": 1, - "type": "DownloadAndLoadCogVideoModel", - "pos": [ - 649, - 182 - ], - "size": { - "0": 315, - "1": 58 - }, - "flags": {}, - "order": 0, - "mode": 0, - "outputs": [ - { - "name": "cogvideo_pipe", - "type": "COGVIDEOPIPE", - "links": [ - 83, - 159 - ], - "shape": 3, - "slot_index": 0 - } - ], - "properties": { - "Node name for S&R": "DownloadAndLoadCogVideoModel" - }, - "widgets_values": [ - "fp16" - ] - }, { "id": 20, "type": "CLIPLoader", @@ -47,7 +14,7 @@ "1": 82 }, "flags": {}, - "order": 1, + "order": 0, "mode": 0, "outputs": [ { @@ -69,100 +36,6 @@ "sd3" ] }, - { - "id": 56, - "type": "SimpleMath+", - "pos": [ - 1413, - 122 - ], - "size": { - "0": 315, - "1": 78 - }, - "flags": {}, - "order": 11, - "mode": 0, - "inputs": [ - { - "name": "a", - "type": "INT,FLOAT", - "link": 121 - }, - { - "name": "b", - "type": "INT,FLOAT", - "link": null - } - ], - "outputs": [ - { - "name": "INT", - "type": "INT", - "links": [ - 120, - 133 - ], - "shape": 3, - "slot_index": 0 - }, - { - "name": "FLOAT", - "type": "FLOAT", - "links": null, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "SimpleMath+" - }, - "widgets_values": [ - "a - 4" - ] - }, - { - "id": 37, - "type": "CogVideoImageEncode", - "pos": [ - 921, - 12 - ], - "size": { - "0": 210, - "1": 46 - }, - "flags": {}, - "order": 7, - "mode": 0, - "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 83, - "slot_index": 0 - }, - { - "name": "image", - "type": "IMAGE", - "link": 129, - "slot_index": 1 - } - ], - "outputs": [ - { - "name": "samples", - "type": "LATENT", - "links": [ - 162 - ], - "shape": 3, - "slot_index": 0 - } - ], - "properties": { - "Node name for S&R": "CogVideoImageEncode" - } - }, { "id": 55, "type": "GetImageSizeAndCount", @@ -175,7 +48,7 @@ "1": 86 }, "flags": {}, - "order": 10, + "order": 13, "mode": 0, "inputs": [ { @@ -190,7 +63,7 @@ "name": "image", "type": "IMAGE", "links": [ - 119 + 170 ], "shape": 3, "slot_index": 0 @@ -208,7 +81,7 @@ "shape": 3 }, { - "name": "17 count", + "name": "25 count", "type": "INT", "links": [ 121 @@ -222,323 +95,131 @@ } }, { - "id": 59, - "type": "GetImageRangeFromBatch", + "id": 31, + "type": "CogVideoTextEncode", "pos": [ - 1445, - 410 + 503, + 521 ], "size": { - "0": 315, - "1": 102 + "0": 463.01251220703125, + "1": 98.10446166992188 }, - "flags": { - "collapsed": true - }, - "order": 13, + "flags": {}, + "order": 4, "mode": 0, "inputs": [ { - "name": "images", - "type": "IMAGE", - "link": 136, - "slot_index": 0 - }, - { - "name": "masks", - "type": "MASK", - "link": null - }, - { - "name": "num_frames", - "type": "INT", - "link": 133, - "widget": { - "name": "num_frames" - } + "name": "clip", + "type": "CLIP", + "link": 56 } ], "outputs": [ { - "name": "IMAGE", - "type": "IMAGE", + "name": "conditioning", + "type": "CONDITIONING", "links": [ - 134 + 161 ], "shape": 3, "slot_index": 0 - }, - { - "name": "MASK", - "type": "MASK", - "links": null, - "shape": 3 } ], "properties": { - "Node name for S&R": "GetImageRangeFromBatch" + "Node name for S&R": "CogVideoTextEncode" }, "widgets_values": [ - 0, - 29 + "bad quality video, blurry, messy" ] }, { - "id": 53, - "type": "GetImageRangeFromBatch", + "id": 1, + "type": "DownloadAndLoadCogVideoModel", "pos": [ - 1451, - 368 + 649, + 182 ], "size": { "0": 315, - "1": 102 + "1": 58 }, - "flags": { - "collapsed": true + "flags": {}, + "order": 1, + "mode": 0, + "outputs": [ + { + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", + "links": [ + 83, + 159 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DownloadAndLoadCogVideoModel" }, + "widgets_values": [ + "fp16" + ] + }, + { + "id": 11, + "type": "CogVideoDecode", + "pos": [ + 1201, + 684 + ], + "size": { + "0": 210, + "1": 46 + }, + "flags": {}, "order": 12, "mode": 0, "inputs": [ { - "name": "images", - "type": "IMAGE", - "link": 119, - "slot_index": 0 + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 166 }, { - "name": "masks", - "type": "MASK", - "link": null - }, - { - "name": "num_frames", - "type": "INT", - "link": 120, - "widget": { - "name": "num_frames" - } + "name": "samples", + "type": "LATENT", + "link": 167 } ], "outputs": [ - { - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 135 - ], - "shape": 3, - "slot_index": 0 - }, - { - "name": "MASK", - "type": "MASK", - "links": null, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "GetImageRangeFromBatch" - }, - "widgets_values": [ - 0, - 29 - ] - }, - { - "id": 58, - "type": "ImageConcanate", - "pos": [ - 1448, - 465 - ], - "size": { - "0": 315, - "1": 102 - }, - "flags": {}, - "order": 14, - "mode": 0, - "inputs": [ - { - "name": "image1", - "type": "IMAGE", - "link": 134 - }, - { - "name": "image2", - "type": "IMAGE", - "link": 135 - } - ], - "outputs": [ - { - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 132 - ], - "shape": 3, - "slot_index": 0 - } - ], - "properties": { - "Node name for S&R": "ImageConcanate" - }, - "widgets_values": [ - "right", - false - ] - }, - { - "id": 47, - "type": "VHS_VideoCombine", - "pos": [ - 1790, - -104 - ], - "size": [ - 1110, - 711.3333333333333 - ], - "flags": {}, - "order": 15, - "mode": 0, - "inputs": [ { "name": "images", "type": "IMAGE", - "link": 132 - }, - { - "name": "audio", - "type": "VHS_AUDIO", - "link": null - }, - { - "name": "meta_batch", - "type": "VHS_BatchManager", - "link": null - }, - { - "name": "vae", - "type": "VAE", - "link": null - } - ], - "outputs": [ - { - "name": "Filenames", - "type": "VHS_FILENAMES", - "links": null, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "VHS_VideoCombine" - }, - "widgets_values": { - "frame_rate": 8, - "loop_count": 0, - "filename_prefix": "AnimateDiff", - "format": "video/nvenc_h264-mp4", - "pix_fmt": "yuv420p", - "bitrate": 10, - "megabit": true, - "save_metadata": true, - "pingpong": false, - "save_output": false, - "videopreview": { - "hidden": false, - "paused": false, - "params": { - "filename": "AnimateDiff_00008.mp4", - "subfolder": "", - "type": "temp", - "format": "video/nvenc_h264-mp4", - "frame_rate": 8 - } - } - } - }, - { - "id": 57, - "type": "GetImageSizeAndCount", - "pos": [ - 674, - 2 - ], - "size": { - "0": 210, - "1": 86 - }, - "flags": {}, - "order": 6, - "mode": 0, - "inputs": [ - { - "name": "image", - "type": "IMAGE", - "link": 126, - "slot_index": 0 - } - ], - "outputs": [ - { - "name": "image", - "type": "IMAGE", "links": [ - 129, - 136 + 118 ], "shape": 3, "slot_index": 0 - }, - { - "name": "720 width", - "type": "INT", - "links": [ - 165 - ], - "shape": 3, - "slot_index": 1 - }, - { - "name": "480 height", - "type": "INT", - "links": [ - 164 - ], - "shape": 3, - "slot_index": 2 - }, - { - "name": "16 count", - "type": "INT", - "links": [ - 163 - ], - "shape": 3, - "slot_index": 3 } ], "properties": { - "Node name for S&R": "GetImageSizeAndCount" + "Node name for S&R": "CogVideoDecode" } }, { "id": 41, "type": "ImageResizeKJ", "pos": [ - 315, - -19 + 206, + -69 ], "size": { "0": 315, "1": 242 }, "flags": {}, - "order": 5, + "order": 7, "mode": 0, "inputs": [ { @@ -605,11 +286,11 @@ ] }, { - "id": 11, - "type": "CogVideoDecode", + "id": 37, + "type": "CogVideoImageEncode", "pos": [ - 1201, - 684 + 939, + -53 ], "size": { "0": 210, @@ -622,27 +303,555 @@ { "name": "pipeline", "type": "COGVIDEOPIPE", - "link": 166 + "link": 83, + "slot_index": 0 }, { - "name": "samples", - "type": "LATENT", - "link": 167 + "name": "image", + "type": "IMAGE", + "link": 129, + "slot_index": 1 } ], "outputs": [ { - "name": "images", - "type": "IMAGE", + "name": "samples", + "type": "LATENT", "links": [ - 118 + 172 ], "shape": 3, "slot_index": 0 } ], "properties": { - "Node name for S&R": "CogVideoDecode" + "Node name for S&R": "CogVideoImageEncode" + } + }, + { + "id": 57, + "type": "GetImageSizeAndCount", + "pos": [ + 603, + -65 + ], + "size": [ + 202.21431350127853, + 99.2360176040001 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 126, + "slot_index": 0 + } + ], + "outputs": [ + { + "name": "image", + "type": "IMAGE", + "links": [ + 129, + 136 + ], + "shape": 3, + "slot_index": 0 + }, + { + "name": "720 width", + "type": "INT", + "links": [ + 165 + ], + "shape": 3, + "slot_index": 1 + }, + { + "name": "480 height", + "type": "INT", + "links": [ + 164 + ], + "shape": 3, + "slot_index": 2 + }, + { + "name": "28 count", + "type": "INT", + "links": [ + 171, + 173 + ], + "shape": 3, + "slot_index": 3 + } + ], + "properties": { + "Node name for S&R": "GetImageSizeAndCount" + } + }, + { + "id": 67, + "type": "SimpleMath+", + "pos": [ + 665, + 98 + ], + "size": { + "0": 315, + "1": 78 + }, + "flags": { + "collapsed": true + }, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "a", + "type": "INT,FLOAT", + "link": 173 + }, + { + "name": "b", + "type": "INT,FLOAT", + "link": null + } + ], + "outputs": [ + { + "name": "INT", + "type": "INT", + "links": [ + 174 + ], + "shape": 3, + "slot_index": 0 + }, + { + "name": "FLOAT", + "type": "FLOAT", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "SimpleMath+" + }, + "widgets_values": [ + "a - 4" + ] + }, + { + "id": 59, + "type": "GetImageRangeFromBatch", + "pos": [ + 1459, + 413 + ], + "size": { + "0": 315, + "1": 102 + }, + "flags": { + "collapsed": true + }, + "order": 15, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 136, + "slot_index": 0 + }, + { + "name": "masks", + "type": "MASK", + "link": null + }, + { + "name": "num_frames", + "type": "INT", + "link": 133, + "widget": { + "name": "num_frames" + } + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 134 + ], + "shape": 3, + "slot_index": 0 + }, + { + "name": "MASK", + "type": "MASK", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "GetImageRangeFromBatch" + }, + "widgets_values": [ + 0, + 29 + ] + }, + { + "id": 58, + "type": "ImageConcanate", + "pos": [ + 1451, + 476 + ], + "size": { + "0": 315, + "1": 102 + }, + "flags": {}, + "order": 16, + "mode": 0, + "inputs": [ + { + "name": "image1", + "type": "IMAGE", + "link": 134 + }, + { + "name": "image2", + "type": "IMAGE", + "link": 170 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 132 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "ImageConcanate" + }, + "widgets_values": [ + "right", + false + ] + }, + { + "id": 56, + "type": "SimpleMath+", + "pos": [ + 1454, + 366 + ], + "size": { + "0": 315, + "1": 78 + }, + "flags": { + "collapsed": true + }, + "order": 14, + "mode": 0, + "inputs": [ + { + "name": "a", + "type": "INT,FLOAT", + "link": 121 + }, + { + "name": "b", + "type": "INT,FLOAT", + "link": 171 + } + ], + "outputs": [ + { + "name": "INT", + "type": "INT", + "links": [ + 133 + ], + "shape": 3, + "slot_index": 0 + }, + { + "name": "FLOAT", + "type": "FLOAT", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "SimpleMath+" + }, + "widgets_values": [ + "a - b" + ] + }, + { + "id": 45, + "type": "VHS_LoadVideo", + "pos": [ + -93, + -153 + ], + "size": [ + 235.1999969482422, + 371.5999984741211 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "meta_batch", + "type": "VHS_BatchManager", + "link": null + }, + { + "name": "vae", + "type": "VAE", + "link": null + }, + { + "name": "frame_load_cap", + "type": "INT", + "link": 176, + "widget": { + "name": "frame_load_cap" + } + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 128 + ], + "shape": 3, + "slot_index": 0 + }, + { + "name": "frame_count", + "type": "INT", + "links": null, + "shape": 3 + }, + { + "name": "audio", + "type": "VHS_AUDIO", + "links": null, + "shape": 3 + }, + { + "name": "video_info", + "type": "VHS_VIDEOINFO", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "VHS_LoadVideo" + }, + "widgets_values": { + "video": "jeep.mp4", + "force_rate": 0, + "force_size": "Disabled", + "custom_width": 512, + "custom_height": 512, + "frame_load_cap": 20, + "skip_first_frames": 0, + "select_every_nth": 1, + "choose video to upload": "image", + "videopreview": { + "hidden": false, + "paused": false, + "params": { + "frame_load_cap": 20, + "skip_first_frames": 0, + "force_rate": 0, + "filename": "jeep.mp4", + "type": "input", + "format": "video/mp4", + "select_every_nth": 1 + } + } + } + }, + { + "id": 68, + "type": "SimpleMath+", + "pos": [ + -75, + -197 + ], + "size": { + "0": 315, + "1": 78 + }, + "flags": { + "collapsed": true + }, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "a", + "type": "INT,FLOAT", + "link": 175, + "slot_index": 0 + }, + { + "name": "b", + "type": "INT,FLOAT", + "link": null + } + ], + "outputs": [ + { + "name": "INT", + "type": "INT", + "links": [ + 176 + ], + "shape": 3, + "slot_index": 0 + }, + { + "name": "FLOAT", + "type": "FLOAT", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "SimpleMath+" + }, + "widgets_values": [ + "a + 4" + ] + }, + { + "id": 69, + "type": "INTConstant", + "pos": [ + -90, + -305 + ], + "size": [ + 200, + 58 + ], + "flags": {}, + "order": 2, + "mode": 0, + "outputs": [ + { + "name": "value", + "type": "INT", + "links": [ + 175 + ], + "shape": 3 + } + ], + "title": "Frames to load", + "properties": { + "Node name for S&R": "INTConstant" + }, + "widgets_values": [ + 24 + ], + "color": "#1b4669", + "bgcolor": "#29699c" + }, + { + "id": 47, + "type": "VHS_VideoCombine", + "pos": [ + 1463, + -407 + ], + "size": [ + 1110, + 711.3333333333333 + ], + "flags": {}, + "order": 17, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 132 + }, + { + "name": "audio", + "type": "VHS_AUDIO", + "link": null + }, + { + "name": "meta_batch", + "type": "VHS_BatchManager", + "link": null + }, + { + "name": "vae", + "type": "VAE", + "link": null + } + ], + "outputs": [ + { + "name": "Filenames", + "type": "VHS_FILENAMES", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "VHS_VideoCombine" + }, + "widgets_values": { + "frame_rate": 8, + "loop_count": 0, + "filename_prefix": "AnimateDiff", + "format": "video/nvenc_h264-mp4", + "pix_fmt": "yuv420p", + "bitrate": 10, + "megabit": true, + "save_metadata": true, + "pingpong": false, + "save_output": false, + "videopreview": { + "hidden": false, + "paused": false, + "params": { + "filename": "AnimateDiff_00007.mp4", + "subfolder": "", + "type": "temp", + "format": "video/nvenc_h264-mp4", + "frame_rate": 8 + } + } } }, { @@ -681,131 +890,9 @@ "Node name for S&R": "CogVideoTextEncode" }, "widgets_values": [ - "cinematic video of a red panda turning it's head" + "A high-definition nature video showcasing a vibrant red panda as it gracefully runs down a crystal-clear stream, surrounded by the serene ambiance of a dense, verdant forest. The sunlight filters through the canopy of tall trees, casting dappled light on the forest floor, while the gentle sound of flowing water and rustling leaves creates a peaceful atmosphere. The red panda’s fur glistens in the sunlight, highlighting its striking red and white markings as it navigates the stream with agility and playfulness, occasionally pausing to drink from the water or look around curiously." ] }, - { - "id": 31, - "type": "CogVideoTextEncode", - "pos": [ - 503, - 521 - ], - "size": { - "0": 463.01251220703125, - "1": 98.10446166992188 - }, - "flags": {}, - "order": 4, - "mode": 0, - "inputs": [ - { - "name": "clip", - "type": "CLIP", - "link": 56 - } - ], - "outputs": [ - { - "name": "conditioning", - "type": "CONDITIONING", - "links": [ - 161 - ], - "shape": 3, - "slot_index": 0 - } - ], - "properties": { - "Node name for S&R": "CogVideoTextEncode" - }, - "widgets_values": [ - "bad quality video, blurry, messy" - ] - }, - { - "id": 45, - "type": "VHS_LoadVideo", - "pos": [ - 21, - -148 - ], - "size": [ - 235.1999969482422, - 491.1999969482422 - ], - "flags": {}, - "order": 2, - "mode": 0, - "inputs": [ - { - "name": "meta_batch", - "type": "VHS_BatchManager", - "link": null - }, - { - "name": "vae", - "type": "VAE", - "link": null - } - ], - "outputs": [ - { - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 128 - ], - "shape": 3, - "slot_index": 0 - }, - { - "name": "frame_count", - "type": "INT", - "links": null, - "shape": 3 - }, - { - "name": "audio", - "type": "VHS_AUDIO", - "links": null, - "shape": 3 - }, - { - "name": "video_info", - "type": "VHS_VIDEOINFO", - "links": null, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "VHS_LoadVideo" - }, - "widgets_values": { - "video": "wolf_source-new.mp4", - "force_rate": 0, - "force_size": "Disabled", - "custom_width": 512, - "custom_height": 512, - "frame_load_cap": 16, - "skip_first_frames": 0, - "select_every_nth": 1, - "choose video to upload": "image", - "videopreview": { - "hidden": false, - "paused": false, - "params": { - "frame_load_cap": 16, - "skip_first_frames": 0, - "force_rate": 0, - "filename": "wolf_source-new.mp4", - "type": "input", - "format": "video/mp4", - "select_every_nth": 1 - } - } - } - }, { "id": 64, "type": "CogVideoSampler", @@ -813,12 +900,12 @@ 1090, 290 ], - "size": { - "0": 315, - "1": 342 - }, + "size": [ + 315, + 342 + ], "flags": {}, - "order": 8, + "order": 11, "mode": 0, "inputs": [ { @@ -839,15 +926,7 @@ { "name": "samples", "type": "LATENT", - "link": 162 - }, - { - "name": "num_frames", - "type": "INT", - "link": 163, - "widget": { - "name": "num_frames" - } + "link": 172 }, { "name": "height", @@ -864,6 +943,14 @@ "widget": { "name": "width" } + }, + { + "name": "num_frames", + "type": "INT", + "link": 174, + "widget": { + "name": "num_frames" + } } ], "outputs": [ @@ -890,14 +977,14 @@ "widgets_values": [ 480, 720, - 48, + 16, 8, - 35, + 50, 9, - 6, + 12, "fixed", "DPM", - 0.7000000000000001 + 0.81 ] } ], @@ -934,22 +1021,6 @@ 0, "IMAGE" ], - [ - 119, - 55, - 0, - 53, - 0, - "IMAGE" - ], - [ - 120, - 56, - 0, - 53, - 2, - "INT" - ], [ 121, 55, @@ -1006,14 +1077,6 @@ 0, "IMAGE" ], - [ - 135, - 53, - 0, - 58, - 1, - "IMAGE" - ], [ 136, 57, @@ -1046,28 +1109,12 @@ 2, "CONDITIONING" ], - [ - 162, - 37, - 0, - 64, - 3, - "LATENT" - ], - [ - 163, - 57, - 3, - 64, - 4, - "INT" - ], [ 164, 57, 2, 64, - 5, + 4, "INT" ], [ @@ -1075,7 +1122,7 @@ 57, 1, 64, - 6, + 5, "INT" ], [ @@ -1093,16 +1140,72 @@ 11, 1, "LATENT" + ], + [ + 170, + 55, + 0, + 58, + 1, + "IMAGE" + ], + [ + 171, + 57, + 3, + 56, + 1, + "INT,FLOAT" + ], + [ + 172, + 37, + 0, + 64, + 3, + "LATENT" + ], + [ + 173, + 57, + 3, + 67, + 0, + "INT,FLOAT" + ], + [ + 174, + 67, + 0, + 64, + 6, + "INT" + ], + [ + 175, + 69, + 0, + 68, + 0, + "INT,FLOAT" + ], + [ + 176, + 68, + 0, + 45, + 2, + "INT" ] ], "groups": [], "config": {}, "extra": { "ds": { - "scale": 0.6830134553650705, + "scale": 0.7513148009015777, "offset": [ - 56.628416841109384, - 394.7727729054069 + 281.39770788130244, + 559.6153930987157 ] } }, diff --git a/nodes.py b/nodes.py index f562580..9239618 100644 --- a/nodes.py +++ b/nodes.py @@ -3,6 +3,7 @@ import torch import folder_paths import comfy.model_management as mm from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler +from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel from .pipeline_cogvideox import CogVideoXPipeline import logging @@ -52,8 +53,11 @@ class DownloadAndLoadCogVideoModel: local_dir=base_path, local_dir_use_symlinks=False, ) + transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder="transformer").to(dtype).to(offload_device) + vae = AutoencoderKLCogVideoX.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device) + scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder="scheduler") - pipe = CogVideoXPipeline.from_pretrained(base_path, torch_dtype=dtype).to(offload_device) + pipe = CogVideoXPipeline(vae, transformer, scheduler) pipeline = { "pipe": pipe, @@ -239,7 +243,6 @@ class CogVideoSampler: prompt_embeds=positive.to(dtype).to(device), negative_prompt_embeds=negative.to(dtype).to(device), generator=generator, - output_type="latents", device=device ) pipe.transformer.to(offload_device) diff --git a/pipeline_cogvideox.py b/pipeline_cogvideox.py index 7faa0d3..f880b0e 100644 --- a/pipeline_cogvideox.py +++ b/pipeline_cogvideox.py @@ -14,16 +14,14 @@ # limitations under the License. import inspect -from dataclasses import dataclass from typing import Callable, Dict, List, Optional, Tuple, Union import torch -from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel from diffusers.pipelines.pipeline_utils import DiffusionPipeline from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler -from diffusers.utils import BaseOutput, logging, replace_example_docstring +from diffusers.utils import logging from diffusers.utils.torch_utils import randn_tensor from diffusers.video_processor import VideoProcessor @@ -31,30 +29,6 @@ from comfy.utils import ProgressBar logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -EXAMPLE_DOC_STRING = """ - Examples: - ```python - >>> from diffusers import CogVideoXPipeline - >>> from diffusers.utils import export_to_video - - >>> pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.bfloat16).to("cuda") - >>> prompt = ( - ... "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. " - ... "The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other " - ... "pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, " - ... "casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. " - ... "The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical " - ... "atmosphere of this unique musical performance." - ... ) - >>> video = pipe( - ... "a polar bear dancing, high quality, realistic", guidance_scale=6, num_inference_steps=20 - ... ).frames[0] - >>> export_to_video(video, "output.mp4", fps=8) - ``` -""" - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps def retrieve_timesteps( scheduler, @@ -114,22 +88,6 @@ def retrieve_timesteps( timesteps = scheduler.timesteps return timesteps, num_inference_steps - -@dataclass -class CogVideoXPipelineOutput(BaseOutput): - r""" - Output class for CogVideo pipelines. - - Args: - frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): - List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing - denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape - `(batch_size, num_frames, channels, height, width)`. - """ - - frames: torch.Tensor - - class CogVideoXPipeline(DiffusionPipeline): r""" Pipeline for text-to-video generation using CogVideoX. @@ -156,12 +114,6 @@ class CogVideoXPipeline(DiffusionPipeline): _optional_components = ["tokenizer", "text_encoder"] model_cpu_offload_seq = "text_encoder->transformer->vae" - _callback_tensor_inputs = [ - "latents", - "prompt_embeds", - "negative_prompt_embeds", - ] - def __init__( self, vae: AutoencoderKLCogVideoX, @@ -199,9 +151,7 @@ class CogVideoXPipeline(DiffusionPipeline): ) if latents is None: - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - # scale the initial noise by the standard deviation required by the scheduler - + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) else: latents = latents.to(device) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, denoise_strength, device) @@ -219,7 +169,7 @@ class CogVideoXPipeline(DiffusionPipeline): latents = latents[:, :frames_needed, :, :, :] latents = self.scheduler.add_noise(latents, noise, latent_timestep) - latents = latents * self.scheduler.init_noise_sigma + latents = latents * self.scheduler.init_noise_sigma # scale the initial noise by the standard deviation required by the scheduler return latents, timesteps # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -245,20 +195,12 @@ class CogVideoXPipeline(DiffusionPipeline): self, height, width, - callback_on_step_end_tensor_inputs, prompt_embeds=None, negative_prompt_embeds=None, ): if height % 8 != 0 or width % 8 != 0: raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - if callback_on_step_end_tensor_inputs is not None and not all( - k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs - ): - raise ValueError( - f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" - ) - if prompt_embeds is not None and negative_prompt_embeds is not None: if prompt_embeds.shape != negative_prompt_embeds.shape: raise ValueError( @@ -297,7 +239,6 @@ class CogVideoXPipeline(DiffusionPipeline): return self._interrupt @torch.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, height: int = 480, @@ -314,25 +255,12 @@ class CogVideoXPipeline(DiffusionPipeline): latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, - output_type: str = "pil", - return_dict: bool = True, - callback_on_step_end: Optional[ - Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] - ] = None, - callback_on_step_end_tensor_inputs: List[str] = ["latents"], device = torch.device("cuda"), - ) -> Union[CogVideoXPipelineOutput, Tuple]: + ): """ Function invoked when calling the pipeline for generation. Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. This is set to 1024 by default for the best results. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): @@ -371,37 +299,12 @@ class CogVideoXPipeline(DiffusionPipeline): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead - of a plain tuple. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - - Examples: - - Returns: - [`~pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipelineOutput`] or `tuple`: - [`~pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipelineOutput`] if `return_dict` is True, otherwise a - `tuple`. When returning a tuple, the first element is a list with the generated images. """ assert ( num_frames <= 48 and num_frames % fps == 0 and fps == 8 ), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX." - if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): - callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs - height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial num_videos_per_prompt = 1 @@ -410,7 +313,6 @@ class CogVideoXPipeline(DiffusionPipeline): self.check_inputs( height, width, - callback_on_step_end_tensor_inputs, prompt_embeds, negative_prompt_embeds, ) @@ -503,17 +405,6 @@ class CogVideoXPipeline(DiffusionPipeline): ) latents = latents.to(prompt_embeds.dtype) - # call the callback, if provided - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) - - latents = callback_outputs.pop("latents", latents) - prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) - negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() comfy_pbar.update(1)