diff --git a/examples/cogvideo_2b_long_02.json b/examples/cogvideo_2b_temporal_tiling_long_01.json similarity index 80% rename from examples/cogvideo_2b_long_02.json rename to examples/cogvideo_2b_temporal_tiling_long_01.json index 4f91be9..f0ca3c8 100644 --- a/examples/cogvideo_2b_long_02.json +++ b/examples/cogvideo_2b_temporal_tiling_long_01.json @@ -5,10 +5,18 @@ { "id": 30, "type": "CogVideoTextEncode", - "pos": [ - 500, - 308 - ], + "pos": { + "0": 500, + "1": 308, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, "size": { "0": 474.8450012207031, "1": 164.7423553466797 @@ -44,10 +52,18 @@ { "id": 20, "type": "CLIPLoader", - "pos": [ - -59, - 397 - ], + "pos": { + "0": -59, + "1": 397, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, "size": { "0": 451.30548095703125, "1": 82 @@ -55,6 +71,7 @@ "flags": {}, "order": 0, "mode": 0, + "inputs": [], "outputs": [ { "name": "CLIP", @@ -78,10 +95,18 @@ { "id": 31, "type": "CogVideoTextEncode", - "pos": [ - 503, - 521 - ], + "pos": { + "0": 503, + "1": 521, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, "size": { "0": 463.01251220703125, "1": 98.10446166992188 @@ -115,110 +140,144 @@ ] }, { - "id": 11, - "type": "CogVideoDecode", - "pos": [ - 1140, - 783 - ], - "size": { - "0": 210, - "1": 78 + "id": 32, + "type": "VHS_VideoCombine", + "pos": { + "0": 1439, + "1": 122, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 }, + "size": [ + 563.3333740234375, + 688.2124814882384 + ], "flags": {}, - "order": 5, + "order": 7, "mode": 0, "inputs": [ { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 37 + "name": "images", + "type": "IMAGE", + "link": 60, + "slot_index": 0 }, { - "name": "samples", - "type": "LATENT", - "link": 38 + "name": "audio", + "type": "VHS_AUDIO", + "link": null + }, + { + "name": "meta_batch", + "type": "VHS_BatchManager", + "link": null + }, + { + "name": "vae", + "type": "VAE", + "link": null } ], "outputs": [ { - "name": "images", - "type": "IMAGE", + "name": "Filenames", + "type": "VHS_FILENAMES", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "VHS_VideoCombine" + }, + "widgets_values": { + "frame_rate": 8, + "loop_count": 0, + "filename_prefix": "CogVideo2B_long", + "format": "video/h264-mp4", + "pix_fmt": "yuv420p", + "crf": 19, + "save_metadata": true, + "pingpong": false, + "save_output": false, + "videopreview": { + "hidden": false, + "paused": false, + "params": { + "filename": "CogVideo2B_long_00001.mp4", + "subfolder": "", + "type": "temp", + "format": "video/h264-mp4", + "frame_rate": 8 + } + } + } + }, + { + "id": 1, + "type": "DownloadAndLoadCogVideoModel", + "pos": { + "0": 653, + "1": 90, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, + "size": { + "0": 315, + "1": 154 + }, + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", "links": [ - 59 + 36 ], "slot_index": 0, "shape": 3 } ], "properties": { - "Node name for S&R": "CogVideoDecode" + "Node name for S&R": "DownloadAndLoadCogVideoModel" }, "widgets_values": [ + "THUDM/CogVideoX-2b", + "fp16", + "disabled", + "disabled", false ] }, - { - "id": 33, - "type": "GetImageSizeAndCount", - "pos": [ - 1189, - 134 - ], - "size": { - "0": 210, - "1": 86 - }, - "flags": {}, - "order": 6, - "mode": 0, - "inputs": [ - { - "name": "image", - "type": "IMAGE", - "link": 59 - } - ], - "outputs": [ - { - "name": "image", - "type": "IMAGE", - "links": [ - 60 - ], - "slot_index": 0, - "shape": 3 - }, - { - "name": "720 width", - "type": "INT", - "links": null, - "shape": 3 - }, - { - "name": "480 height", - "type": "INT", - "links": null, - "shape": 3 - }, - { - "name": "32 count", - "type": "INT", - "links": null, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "GetImageSizeAndCount" - } - }, { "id": 22, "type": "CogVideoSampler", - "pos": [ - 1041, - 342 - ], + "pos": { + "0": 1041, + "1": 342, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, "size": { "0": 315, "1": 382 @@ -278,114 +337,129 @@ 6, 6, 806286757407563, - "DDIM", + "DDIM_tiled", 48, 8, 1 ] }, { - "id": 1, - "type": "DownloadAndLoadCogVideoModel", - "pos": [ - 649, - 182 - ], - "size": { - "0": 315, - "1": 82 + "id": 11, + "type": "CogVideoDecode", + "pos": { + "0": 1049, + "1": 772, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 }, + "size": [ + 295.70112532900725, + 198 + ], "flags": {}, - "order": 1, + "order": 5, "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 37 + }, + { + "name": "samples", + "type": "LATENT", + "link": 38 + } + ], "outputs": [ { - "name": "cogvideo_pipe", - "type": "COGVIDEOPIPE", + "name": "images", + "type": "IMAGE", "links": [ - 36 + 59 ], "slot_index": 0, "shape": 3 } ], "properties": { - "Node name for S&R": "DownloadAndLoadCogVideoModel" + "Node name for S&R": "CogVideoDecode" }, "widgets_values": [ - "THUDM/CogVideoX-2b", - "fp16" + true, + 96, + 96, + 0.083, + 0.083, + true ] }, { - "id": 32, - "type": "VHS_VideoCombine", - "pos": [ - 1439, - 122 - ], - "size": [ - 563.3333740234375, - 310 - ], + "id": 33, + "type": "GetImageSizeAndCount", + "pos": { + "0": 1176, + "1": 122, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, + "size": { + "0": 210, + "1": 86 + }, "flags": {}, - "order": 7, + "order": 6, "mode": 0, "inputs": [ { - "name": "images", + "name": "image", "type": "IMAGE", - "link": 60, - "slot_index": 0 - }, - { - "name": "audio", - "type": "VHS_AUDIO", - "link": null - }, - { - "name": "meta_batch", - "type": "VHS_BatchManager", - "link": null - }, - { - "name": "vae", - "type": "VAE", - "link": null + "link": 59 } ], "outputs": [ { - "name": "Filenames", - "type": "VHS_FILENAMES", + "name": "image", + "type": "IMAGE", + "links": [ + 60 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "728 width", + "type": "INT", + "links": null, + "shape": 3 + }, + { + "name": "485 height", + "type": "INT", + "links": null, + "shape": 3 + }, + { + "name": "96 count", + "type": "INT", "links": null, "shape": 3 } ], "properties": { - "Node name for S&R": "VHS_VideoCombine" - }, - "widgets_values": { - "frame_rate": 8, - "loop_count": 0, - "filename_prefix": "CogVideo2B_long", - "format": "video/h264-mp4", - "pix_fmt": "yuv420p", - "crf": 19, - "save_metadata": true, - "pingpong": false, - "save_output": false, - "videopreview": { - "hidden": false, - "paused": false, - "params": { - "filename": "AnimateDiff_00001.mp4", - "subfolder": "", - "type": "temp", - "format": "video/h264-mp4", - "frame_rate": 8 - } - } + "Node name for S&R": "GetImageSizeAndCount" } } ], @@ -467,10 +541,10 @@ "config": {}, "extra": { "ds": { - "scale": 0.8264462809917354, + "scale": 0.7513148009015777, "offset": [ - 86.92928825501215, - 77.5537144406024 + 253.3863163213836, + 255.76127216744268 ] } }, diff --git a/examples/cogvideo_2b_vid2vid_test_example_02.json b/examples/cogvideo_5b_vid2vid_example_01.json similarity index 83% rename from examples/cogvideo_2b_vid2vid_test_example_02.json rename to examples/cogvideo_5b_vid2vid_example_01.json index f78505c..4aba556 100644 --- a/examples/cogvideo_2b_vid2vid_test_example_02.json +++ b/examples/cogvideo_5b_vid2vid_example_01.json @@ -5,10 +5,18 @@ { "id": 20, "type": "CLIPLoader", - "pos": [ - -29, - 407 - ], + "pos": { + "0": -29, + "1": 407, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, "size": { "0": 451.30548095703125, "1": 82 @@ -16,6 +24,7 @@ "flags": {}, "order": 0, "mode": 0, + "inputs": [], "outputs": [ { "name": "CLIP", @@ -39,10 +48,18 @@ { "id": 31, "type": "CogVideoTextEncode", - "pos": [ - 503, - 521 - ], + "pos": { + "0": 503, + "1": 521, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, "size": { "0": 463.01251220703125, "1": 98.10446166992188 @@ -78,10 +95,18 @@ { "id": 41, "type": "ImageResizeKJ", - "pos": [ - 206, - -69 - ], + "pos": { + "0": 206, + "1": -69, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, "size": { "0": 315, "1": 242 @@ -154,100 +179,21 @@ "disabled" ] }, - { - "id": 37, - "type": "CogVideoImageEncode", - "pos": [ - 939, - -53 - ], - "size": { - "0": 210, - "1": 46 - }, - "flags": {}, - "order": 9, - "mode": 0, - "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 83, - "slot_index": 0 - }, - { - "name": "image", - "type": "IMAGE", - "link": 129, - "slot_index": 1 - } - ], - "outputs": [ - { - "name": "samples", - "type": "LATENT", - "links": [ - 172 - ], - "slot_index": 0, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "CogVideoImageEncode" - } - }, - { - "id": 11, - "type": "CogVideoDecode", - "pos": [ - 1224, - 737 - ], - "size": { - "0": 210, - "1": 78 - }, - "flags": {}, - "order": 11, - "mode": 0, - "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 166 - }, - { - "name": "samples", - "type": "LATENT", - "link": 167 - } - ], - "outputs": [ - { - "name": "images", - "type": "IMAGE", - "links": [ - 118 - ], - "slot_index": 0, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "CogVideoDecode" - }, - "widgets_values": [ - false - ] - }, { "id": 30, "type": "CogVideoTextEncode", - "pos": [ - 500, - 308 - ], + "pos": { + "0": 500, + "1": 308, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, "size": { "0": 474.8450012207031, "1": 164.7423553466797 @@ -283,10 +229,18 @@ { "id": 57, "type": "GetImageSizeAndCount", - "pos": [ - 603, - -65 - ], + "pos": { + "0": 603, + "1": -65, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, "size": { "0": 202.2143096923828, "1": 99.23601531982422 @@ -332,7 +286,7 @@ "shape": 3 }, { - "name": "32 count", + "name": "33 count", "type": "INT", "links": [ 178, @@ -349,10 +303,18 @@ { "id": 45, "type": "VHS_LoadVideo", - "pos": [ - -93, - -153 - ], + "pos": { + "0": -93, + "1": -153, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, "size": [ 235.1999969482422, 359.5999984741211 @@ -440,10 +402,18 @@ { "id": 70, "type": "GetImageSizeAndCount", - "pos": [ - 214, - -234 - ], + "pos": { + "0": 214, + "1": -234, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, "size": { "0": 202.2143096923828, "1": 99.23601531982422 @@ -484,7 +454,7 @@ "shape": 3 }, { - "name": "32 count", + "name": "33 count", "type": "INT", "links": [], "slot_index": 3, @@ -498,10 +468,18 @@ { "id": 69, "type": "INTConstant", - "pos": [ - -90, - -305 - ], + "pos": { + "0": -90, + "1": -305, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, "size": { "0": 210, "1": 58 @@ -509,6 +487,7 @@ "flags": {}, "order": 1, "mode": 0, + "inputs": [], "outputs": [ { "name": "value", @@ -529,13 +508,145 @@ "color": "#1b4669", "bgcolor": "#29699c" }, + { + "id": 47, + "type": "VHS_VideoCombine", + "pos": { + "0": 1560, + "1": -379, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, + "size": [ + 1110, + 310 + ], + "flags": {}, + "order": 14, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 132 + }, + { + "name": "audio", + "type": "VHS_AUDIO", + "link": null + }, + { + "name": "meta_batch", + "type": "VHS_BatchManager", + "link": null + }, + { + "name": "vae", + "type": "VAE", + "link": null + } + ], + "outputs": [ + { + "name": "Filenames", + "type": "VHS_FILENAMES", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "VHS_VideoCombine" + }, + "widgets_values": { + "frame_rate": 8, + "loop_count": 0, + "filename_prefix": "CogVideoX_vid2vid", + "format": "video/h264-mp4", + "pix_fmt": "yuv420p", + "crf": 19, + "save_metadata": true, + "pingpong": false, + "save_output": false, + "videopreview": { + "hidden": false, + "paused": false, + "params": { + "filename": "AnimateDiff_00001.mp4", + "subfolder": "", + "type": "temp", + "format": "video/h264-mp4", + "frame_rate": 8 + } + } + } + }, + { + "id": 1, + "type": "DownloadAndLoadCogVideoModel", + "pos": { + "0": 606, + "1": 85, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, + "size": { + "0": 315, + "1": 154 + }, + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", + "links": [ + 83, + 159 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "DownloadAndLoadCogVideoModel" + }, + "widgets_values": [ + "THUDM/CogVideoX-5b", + "fp16", + "disabled", + "disabled", + false + ] + }, { "id": 64, "type": "CogVideoSampler", - "pos": [ - 1090, - 290 - ], + "pos": { + "0": 1090, + "1": 290, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, "size": { "0": 312.9841613769531, "1": 342.8801574707031 @@ -627,98 +738,144 @@ 6, 9, "fixed", - "DDIM", + "DPM", "DDIM", 8, 0.85 ] }, { - "id": 1, - "type": "DownloadAndLoadCogVideoModel", - "pos": [ - 649, - 182 - ], - "size": { - "0": 315, - "1": 82 + "id": 11, + "type": "CogVideoDecode", + "pos": { + "0": 1097, + "1": 681, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 }, + "size": [ + 301.1664045038119, + 198 + ], "flags": {}, - "order": 2, - "mode": 0, - "outputs": [ - { - "name": "cogvideo_pipe", - "type": "COGVIDEOPIPE", - "links": [ - 83, - 159 - ], - "slot_index": 0, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "DownloadAndLoadCogVideoModel" - }, - "widgets_values": [ - "THUDM/CogVideoX-2b", - "fp16" - ] - }, - { - "id": 58, - "type": "ImageConcanate", - "pos": [ - 1499, - 433 - ], - "size": { - "0": 315, - "1": 102 - }, - "flags": {}, - "order": 13, + "order": 11, "mode": 0, "inputs": [ { - "name": "image1", - "type": "IMAGE", - "link": 191 + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 166 }, { - "name": "image2", - "type": "IMAGE", - "link": 170 + "name": "samples", + "type": "LATENT", + "link": 167 } ], "outputs": [ { - "name": "IMAGE", + "name": "images", "type": "IMAGE", "links": [ - 132 + 118 ], "slot_index": 0, "shape": 3 } ], "properties": { - "Node name for S&R": "ImageConcanate" + "Node name for S&R": "CogVideoDecode" }, "widgets_values": [ - "right", - false + false, + 96, + 96, + 0.083, + 0.083, + true + ] + }, + { + "id": 37, + "type": "CogVideoImageEncode", + "pos": { + "0": 975, + "1": -73, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, + "size": { + "0": 210, + "1": 122 + }, + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 83, + "slot_index": 0 + }, + { + "name": "image", + "type": "IMAGE", + "link": 129, + "slot_index": 1 + }, + { + "name": "mask", + "type": "MASK", + "link": null + } + ], + "outputs": [ + { + "name": "samples", + "type": "LATENT", + "links": [ + 172 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoImageEncode" + }, + "widgets_values": [ + 8, + true ] }, { "id": 55, "type": "GetImageSizeAndCount", - "pos": [ - 1223, - 122 - ], + "pos": { + "0": 1195, + "1": 154, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, "size": { "0": 210, "1": 86 @@ -769,75 +926,57 @@ } }, { - "id": 47, - "type": "VHS_VideoCombine", - "pos": [ - 1560, - -379 - ], - "size": [ - 1110, - 711.3333333333333 - ], + "id": 58, + "type": "ImageConcanate", + "pos": { + "0": 1434, + "1": 289, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, + "size": { + "0": 315, + "1": 102 + }, "flags": {}, - "order": 14, + "order": 13, "mode": 0, "inputs": [ { - "name": "images", + "name": "image1", "type": "IMAGE", - "link": 132 + "link": 191 }, { - "name": "audio", - "type": "VHS_AUDIO", - "link": null - }, - { - "name": "meta_batch", - "type": "VHS_BatchManager", - "link": null - }, - { - "name": "vae", - "type": "VAE", - "link": null + "name": "image2", + "type": "IMAGE", + "link": 170 } ], "outputs": [ { - "name": "Filenames", - "type": "VHS_FILENAMES", - "links": null, + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 132 + ], + "slot_index": 0, "shape": 3 } ], "properties": { - "Node name for S&R": "VHS_VideoCombine" + "Node name for S&R": "ImageConcanate" }, - "widgets_values": { - "frame_rate": 8, - "loop_count": 0, - "filename_prefix": "CogVideoX_vid2vid", - "format": "video/h264-mp4", - "pix_fmt": "yuv420p", - "bitrate": 10, - "megabit": true, - "save_metadata": true, - "pingpong": false, - "save_output": false, - "videopreview": { - "hidden": false, - "paused": false, - "params": { - "filename": "AnimateDiff_00001.mp4", - "subfolder": "", - "type": "temp", - "format": "video/h264-mp4", - "frame_rate": 8 - } - } - } + "widgets_values": [ + "right", + false + ] } ], "links": [ @@ -1022,10 +1161,10 @@ "config": {}, "extra": { "ds": { - "scale": 0.620921323059155, + "scale": 0.7513148009015777, "offset": [ - 298.59028824596885, - 694.562497939138 + 280.8935954961883, + 403.945992992638 ] } }, diff --git a/examples/cogvideox_5b_example_01.json b/examples/cogvideox_5b_example_01.json index 04fb30d..153960b 100644 --- a/examples/cogvideox_5b_example_01.json +++ b/examples/cogvideox_5b_example_01.json @@ -5,10 +5,18 @@ { "id": 31, "type": "CogVideoTextEncode", - "pos": [ - 503, - 521 - ], + "pos": { + "0": 503, + "1": 521, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, "size": { "0": 463.01251220703125, "1": 98.10446166992188 @@ -41,94 +49,25 @@ "" ] }, - { - "id": 11, - "type": "CogVideoDecode", - "pos": [ - 1140, - 783 - ], - "size": { - "0": 210, - "1": 78 - }, - "flags": {}, - "order": 5, - "mode": 0, - "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 63 - }, - { - "name": "samples", - "type": "LATENT", - "link": 64 - } - ], - "outputs": [ - { - "name": "images", - "type": "IMAGE", - "links": [ - 59 - ], - "slot_index": 0, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "CogVideoDecode" - }, - "widgets_values": [ - false - ] - }, - { - "id": 1, - "type": "DownloadAndLoadCogVideoModel", - "pos": [ - 649, - 182 - ], - "size": { - "0": 315, - "1": 82 - }, - "flags": {}, - "order": 0, - "mode": 0, - "outputs": [ - { - "name": "cogvideo_pipe", - "type": "COGVIDEOPIPE", - "links": [ - 60 - ], - "slot_index": 0, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "DownloadAndLoadCogVideoModel" - }, - "widgets_values": [ - "THUDM/CogVideoX-5b", - "bf16" - ] - }, { "id": 30, "type": "CogVideoTextEncode", - "pos": [ - 500, - 308 - ], - "size": [ - 471.90143257018326, - 168.0804709842023 - ], + "pos": { + "0": 500, + "1": 308, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, + "size": { + "0": 471.90142822265625, + "1": 168.08047485351562 + }, "flags": {}, "order": 2, "mode": 0, @@ -160,14 +99,22 @@ { "id": 34, "type": "CogVideoSampler", - "pos": [ - 1041, - 342 - ], - "size": [ - 315.84047081854465, - 358 - ], + "pos": { + "0": 1041, + "1": 342, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, + "size": { + "0": 315.8404846191406, + "1": 358 + }, "flags": {}, "order": 4, "mode": 0, @@ -231,13 +178,21 @@ { "id": 33, "type": "VHS_VideoCombine", - "pos": [ - 1441, - 129 - ], + "pos": { + "0": 1441, + "1": 129, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, "size": [ 778.7022705078125, - 853.801513671875 + 310 ], "flags": {}, "order": 6, @@ -281,8 +236,7 @@ "filename_prefix": "CogVideoX5B", "format": "video/h264-mp4", "pix_fmt": "yuv420p", - "bitrate": 10, - "megabit": true, + "crf": 19, "save_metadata": true, "pingpong": false, "save_output": false, @@ -303,17 +257,26 @@ { "id": 20, "type": "CLIPLoader", - "pos": [ - -26, - 400 - ], + "pos": { + "0": -26, + "1": 400, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, "size": { "0": 451.30548095703125, "1": 82 }, "flags": {}, - "order": 1, + "order": 0, "mode": 0, + "inputs": [], "outputs": [ { "name": "CLIP", @@ -333,6 +296,108 @@ "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", "sd3" ] + }, + { + "id": 1, + "type": "DownloadAndLoadCogVideoModel", + "pos": { + "0": 642, + "1": 90, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, + "size": { + "0": 315, + "1": 154 + }, + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", + "links": [ + 60 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "DownloadAndLoadCogVideoModel" + }, + "widgets_values": [ + "THUDM/CogVideoX-5b", + "bf16", + "disabled", + "disabled", + false + ] + }, + { + "id": 11, + "type": "CogVideoDecode", + "pos": { + "0": 1051, + "1": 748, + "2": 0, + "3": 0, + "4": 0, + "5": 0, + "6": 0, + "7": 0, + "8": 0, + "9": 0 + }, + "size": [ + 300.3964783563508, + 198 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 63 + }, + { + "name": "samples", + "type": "LATENT", + "link": 64 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 59 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoDecode" + }, + "widgets_values": [ + false, + 96, + 96, + 0.083, + 0.083, + true + ] } ], "links": [ @@ -405,10 +470,10 @@ "config": {}, "extra": { "ds": { - "scale": 0.7513148009015777, + "scale": 0.8264462809917354, "offset": [ - 209.1392882550122, - 105.74671444060245 + 161.910286780368, + 124.7586178095323 ] } }, diff --git a/nodes.py b/nodes.py index 5da4dac..227545e 100644 --- a/nodes.py +++ b/nodes.py @@ -48,10 +48,6 @@ class DownloadAndLoadCogVideoModel: mm.soft_empty_cache() dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision] - if fp8_transformer != "disabled": - transformer_dtype = torch.float8_e4m3fn - else: - transformer_dtype = dtype if "2b" in model: base_path = os.path.join(folder_paths.models_dir, "CogVideo", "CogVideo2B") @@ -68,12 +64,15 @@ class DownloadAndLoadCogVideoModel: local_dir=base_path, local_dir_use_symlinks=False, ) - transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder="transformer").to(transformer_dtype).to(offload_device) - if fp8_transformer == "fastmode": - from .fp8_optimization import convert_fp8_linear - convert_fp8_linear(transformer, dtype) + if fp8_transformer == "enabled" or fp8_transformer == "fastmode": + transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder="transformer").to(torch.float8_e4m3fn).to(offload_device) + if fp8_transformer == "fastmode": + from .fp8_optimization import convert_fp8_linear + convert_fp8_linear(transformer, dtype) + else: + transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder="transformer").to(dtype).to(offload_device) + vae = AutoencoderKLCogVideoX.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device) - scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder="scheduler") pipe = CogVideoXPipeline(vae, transformer, scheduler) @@ -95,8 +94,6 @@ class DownloadAndLoadCogVideoModel: fuse_qkv_projections=True, ) - - pipeline = { "pipe": pipe, "dtype": dtype, @@ -215,6 +212,8 @@ class CogVideoImageEncode: # mask = mask.unsqueeze(-1).repeat(1, 1, 1, C) # print(mask.shape) # input_image = input_image * (1 -mask) + else: + pipeline["pipe"].original_mask = None input_image = input_image * 2.0 - 1.0 input_image = input_image.to(vae.dtype).to(device) @@ -265,7 +264,7 @@ class CogVideoSampler: "steps": ("INT", {"default": 50, "min": 1}), "cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}), "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}), - "scheduler": (["DDIM", "DPM"], {"tooltip": "5B likes DPM, but it doesn't support temporal tiling"}), + "scheduler": (["DDIM", "DPM", "DDIM_tiled"], {"tooltip": "5B likes DPM, but it doesn't support temporal tiling"}), "t_tile_length": ("INT", {"default": 16, "min": 2, "max": 128, "step": 1, "tooltip": "Length of temporal tiling, use same alue as num_frames to disable, disabled automatically for DPM"}), "t_tile_overlap": ("INT", {"default": 8, "min": 2, "max": 128, "step": 1, "tooltip": "Overlap of temporal tiling"}), }, @@ -298,7 +297,7 @@ class CogVideoSampler: pipe.transformer.to(device) generator = torch.Generator(device=device).manual_seed(seed) - if scheduler == "DDIM": + if scheduler == "DDIM" or scheduler == "DDIM_tiled": pipe.scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder="scheduler") elif scheduler == "DPM": pipe.scheduler = CogVideoXDPMScheduler.from_pretrained(base_path, subfolder="scheduler") @@ -324,7 +323,8 @@ class CogVideoSampler: prompt_embeds=positive.to(dtype).to(device), negative_prompt_embeds=negative.to(dtype).to(device), generator=generator, - device=device + device=device, + scheduler_name=scheduler ) if not pipeline["cpu_offloading"]: pipe.transformer.to(offload_device) diff --git a/pipeline_cogvideox.py b/pipeline_cogvideox.py index b19c06c..a713fbf 100644 --- a/pipeline_cogvideox.py +++ b/pipeline_cogvideox.py @@ -332,10 +332,11 @@ class CogVideoXPipeline(DiffusionPipeline): num_videos_per_prompt: int = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, device = torch.device("cuda"), + scheduler_name: str = "DPM", ): """ Function invoked when calling the pipeline for generation. @@ -421,8 +422,11 @@ class CogVideoXPipeline(DiffusionPipeline): if latents is None and num_frames == t_tile_length: num_frames += 1 - image_latents = latents - original_image_latents = image_latents + + if self.original_mask is not None: + image_latents = latents + original_image_latents = image_latents + latents, timesteps, noise = self.prepare_latents( batch_size * num_videos_per_prompt, latent_channels, @@ -439,15 +443,9 @@ class CogVideoXPipeline(DiffusionPipeline): ) latents = latents.to(self.transformer.dtype) - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - t_tile_weights = self._gaussian_weights(t_tile_length=t_tile_length, t_batch_size=1).to(latents.device).to(latents.dtype) - print("latents.shape", latents.shape) - print("latents.device", latents.device) - - # 6.5. Create rotary embeds if required image_rotary_emb = ( self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device) @@ -471,15 +469,23 @@ class CogVideoXPipeline(DiffusionPipeline): # 7. Denoising loop num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) comfy_pbar = ProgressBar(num_inference_steps) + + # 8. Temporal tiling prep + if "tiled" in scheduler_name: + t_tile_weights = self._gaussian_weights(t_tile_length=t_tile_length, t_batch_size=1).to(latents.device).to(self.vae.dtype) + temporal_tiling = True + print("Temporal tiling enabled") + else: + temporal_tiling = False + print("Temporal tiling disabled") + print("latents.shape", latents.shape) - with self.progress_bar(total=num_inference_steps) as progress_bar: - - # for DPM-solver++ - old_pred_original_sample = None + with self.progress_bar(total=num_inference_steps) as progress_bar: + old_pred_original_sample = None # for DPM-solver++ for i, t in enumerate(timesteps): if self.interrupt: continue - if not isinstance(self.scheduler, CogVideoXDPMScheduler): + if temporal_tiling and isinstance(self.scheduler, CogVideoXDDIMScheduler): #temporal tiling code based on https://github.com/mayuelala/FollowYourEmoji/blob/main/models/video_pipeline.py # ===================================================== grid_ts = 0 @@ -532,12 +538,12 @@ class CogVideoXPipeline(DiffusionPipeline): noise_pred = noise_pred_uncond + self._guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents_tile = self.scheduler.step(noise_pred, t, latents_tile, **extra_step_kwargs, return_dict=False)[0] + latents_tile = self.scheduler.step(noise_pred, t, latents_tile.to(self.vae.dtype), **extra_step_kwargs, return_dict=False)[0] latents_all_list.append(latents_tile) # ========================================== - latents_all = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype) - contributors = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype) + latents_all = torch.zeros(latents.shape, device=latents.device, dtype=self.vae.dtype) + contributors = torch.zeros(latents.shape, device=latents.device, dtype=self.vae.dtype) # Add each tile contribution to overall latents for t_i in range(grid_ts): if t_i < grid_ts - 1: @@ -573,7 +579,6 @@ class CogVideoXPipeline(DiffusionPipeline): comfy_pbar.update(1) # ========================================== else: - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) @@ -590,25 +595,28 @@ class CogVideoXPipeline(DiffusionPipeline): )[0] noise_pred = noise_pred.float() - - self._guidance_scale = 1 + guidance_scale * ( - (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2 - ) + if isinstance(self.scheduler, CogVideoXDPMScheduler): + self._guidance_scale = 1 + guidance_scale * ( + (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2 + ) if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + self._guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents, old_pred_original_sample = self.scheduler.step( - noise_pred, - old_pred_original_sample, - t, - timesteps[i - 1] if i > 0 else None, - latents.to(self.vae.dtype), - **extra_step_kwargs, - return_dict=False, - ) + if not isinstance(self.scheduler, CogVideoXDPMScheduler): + latents = self.scheduler.step(noise_pred, t, latents.to(self.vae.dtype), **extra_step_kwargs, return_dict=False)[0] + else: + latents, old_pred_original_sample = self.scheduler.step( + noise_pred, + old_pred_original_sample, + t, + timesteps[i - 1] if i > 0 else None, + latents.to(self.vae.dtype), + **extra_step_kwargs, + return_dict=False, + ) # start diff diff if i < len(timesteps) - 1 and self.original_mask is not None: noise_timestep = timesteps[i + 1]