From 3de01139277de7052a61849b6479bf0e2d1483a6 Mon Sep 17 00:00:00 2001 From: kijai <40791699+kijai@users.noreply.github.com> Date: Tue, 1 Oct 2024 17:09:44 +0300 Subject: [PATCH] Separate CogVideoX-Fun vid2vid and control samplers, add automatic tile size for decode --- examples/cogvideox_fun_pose_example_01.json | 1174 +++++++++---------- nodes.py | 224 ++-- 2 files changed, 731 insertions(+), 667 deletions(-) diff --git a/examples/cogvideox_fun_pose_example_01.json b/examples/cogvideox_fun_pose_example_01.json index 7fe634e..e4c827f 100644 --- a/examples/cogvideox_fun_pose_example_01.json +++ b/examples/cogvideox_fun_pose_example_01.json @@ -1,6 +1,6 @@ { - "last_node_id": 77, - "last_link_id": 159, + "last_node_id": 80, + "last_link_id": 174, "nodes": [ { "id": 31, @@ -28,7 +28,7 @@ "name": "conditioning", "type": "CONDITIONING", "links": [ - 114 + 167 ], "slot_index": 0, "shape": 3 @@ -85,225 +85,6 @@ "" ] }, - { - "id": 37, - "type": "ImageResizeKJ", - "pos": { - "0": 666, - "1": 745 - }, - "size": { - "0": 315, - "1": 266 - }, - "flags": {}, - "order": 8, - "mode": 0, - "inputs": [ - { - "name": "image", - "type": "IMAGE", - "link": 159 - }, - { - "name": "get_image_size", - "type": "IMAGE", - "link": null - }, - { - "name": "width_input", - "type": "INT", - "link": null, - "widget": { - "name": "width_input" - } - }, - { - "name": "height_input", - "type": "INT", - "link": null, - "widget": { - "name": "height_input" - } - } - ], - "outputs": [ - { - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 130 - ], - "slot_index": 0, - "shape": 3 - }, - { - "name": "width", - "type": "INT", - "links": null, - "shape": 3 - }, - { - "name": "height", - "type": "INT", - "links": null, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "ImageResizeKJ" - }, - "widgets_values": [ - 512, - 512, - "lanczos", - true, - 8, - 0, - 0, - "disabled" - ] - }, - { - "id": 61, - "type": "GetImageSizeAndCount", - "pos": { - "0": 1024, - "1": 769 - }, - "size": { - "0": 277.20001220703125, - "1": 86 - }, - "flags": {}, - "order": 9, - "mode": 0, - "inputs": [ - { - "name": "image", - "type": "IMAGE", - "link": 130 - } - ], - "outputs": [ - { - "name": "image", - "type": "IMAGE", - "links": [ - 131, - 135 - ], - "slot_index": 0, - "shape": 3 - }, - { - "name": "512 width", - "type": "INT", - "links": null, - "shape": 3 - }, - { - "name": "368 height", - "type": "INT", - "links": null, - "shape": 3 - }, - { - "name": "49 count", - "type": "INT", - "links": null, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "GetImageSizeAndCount" - }, - "widgets_values": [] - }, - { - "id": 20, - "type": "CLIPLoader", - "pos": { - "0": -26, - "1": 400 - }, - "size": { - "0": 451.30548095703125, - "1": 82 - }, - "flags": {}, - "order": 0, - "mode": 0, - "inputs": [], - "outputs": [ - { - "name": "CLIP", - "type": "CLIP", - "links": [ - 54, - 56 - ], - "slot_index": 0, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "CLIPLoader" - }, - "widgets_values": [ - "t5\\clip\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", - "sd3" - ] - }, - { - "id": 11, - "type": "CogVideoDecode", - "pos": { - "0": 1451, - "1": 363 - }, - "size": { - "0": 300.396484375, - "1": 198 - }, - "flags": {}, - "order": 11, - "mode": 0, - "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 115 - }, - { - "name": "samples", - "type": "LATENT", - "link": 116 - } - ], - "outputs": [ - { - "name": "images", - "type": "IMAGE", - "links": [ - 124 - ], - "slot_index": 0, - "shape": 3 - } - ], 
- "properties": { - "Node name for S&R": "CogVideoDecode" - }, - "widgets_values": [ - true, - 240, - 360, - 0.2, - 0.2, - true - ] - }, { "id": 59, "type": "AddLabel", @@ -368,12 +149,494 @@ "" ] }, + { + "id": 11, + "type": "CogVideoDecode", + "pos": { + "0": 1451, + "1": 363 + }, + "size": { + "0": 282.7455749511719, + "1": 198 + }, + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 170 + }, + { + "name": "samples", + "type": "LATENT", + "link": 171 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 124 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoDecode" + }, + "widgets_values": [ + true, + 240, + 360, + 0.2, + 0.2, + true + ] + }, + { + "id": 79, + "type": "CogVideoXFunControlSampler", + "pos": { + "0": 1085, + "1": 312 + }, + "size": { + "0": 313.41632080078125, + "1": 330 + }, + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 165 + }, + { + "name": "positive", + "type": "CONDITIONING", + "link": 166 + }, + { + "name": "negative", + "type": "CONDITIONING", + "link": 167 + }, + { + "name": "control_video", + "type": "IMAGE", + "link": 168 + }, + { + "name": "video_length", + "type": "INT", + "link": 169, + "widget": { + "name": "video_length" + } + } + ], + "outputs": [ + { + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", + "links": [ + 170 + ], + "shape": 3 + }, + { + "name": "samples", + "type": "LATENT", + "links": [ + 171 + ], + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoXFunControlSampler" + }, + "widgets_values": [ + 49, + 512, + 42, + "fixed", + 25, + 6, + "DPM++", + 0.7000000000000001, + 0, + 1 + ] + }, + { + "id": 30, + "type": "CogVideoTextEncode", + "pos": { + "0": 513, + "1": 286 + }, + "size": { + "0": 471.90142822265625, + "1": 168.08047485351562 + }, + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 54 + } + ], + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [ + 128, + 166 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoTextEncode" + }, + "widgets_values": [ + "a brown bear is dancing in a forest, in front of a waterfall", + 1, + true + ] + }, + { + "id": 65, + "type": "VHS_LoadVideo", + "pos": { + "0": -191, + "1": 564 + }, + "size": [ + 390.1356201171875, + 910.0188802083334 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "meta_batch", + "type": "VHS_BatchManager", + "link": null + }, + { + "name": "vae", + "type": "VAE", + "link": null + }, + { + "name": "frame_load_cap", + "type": "INT", + "link": 152, + "widget": { + "name": "frame_load_cap" + } + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 173 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "frame_count", + "type": "INT", + "links": null, + "shape": 3 + }, + { + "name": "audio", + "type": "AUDIO", + "links": null, + "shape": 3 + }, + { + "name": "video_info", + "type": "VHS_VIDEOINFO", + "links": [], + "slot_index": 3, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "VHS_LoadVideo" + }, + "widgets_values": { + "video": "01.mp4", + "force_rate": 0, + "force_size": "Disabled", + "custom_width": 512, + "custom_height": 512, + "frame_load_cap": 17, + "skip_first_frames": 0, + "select_every_nth": 1, + "choose video to upload": "image", + "videopreview": { 
+ "hidden": false, + "paused": false, + "params": { + "frame_load_cap": 17, + "skip_first_frames": 0, + "force_rate": 0, + "filename": "01.mp4", + "type": "input", + "format": "video/mp4", + "select_every_nth": 1 + }, + "muted": false + } + } + }, + { + "id": 20, + "type": "CLIPLoader", + "pos": { + "0": 2, + "1": 412 + }, + "size": { + "0": 451.30548095703125, + "1": 82 + }, + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "CLIP", + "type": "CLIP", + "links": [ + 54, + 56 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CLIPLoader" + }, + "widgets_values": [ + "t5\\clip\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", + "sd3" + ] + }, + { + "id": 80, + "type": "DWPreprocessor", + "pos": { + "0": 260, + "1": 742 + }, + "size": { + "0": 364.7358703613281, + "1": 198 + }, + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 173 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 174 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "POSE_KEYPOINT", + "type": "POSE_KEYPOINT", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "DWPreprocessor" + }, + "widgets_values": [ + "enable", + "enable", + "enable", + 512, + "yolox_l.torchscript.pt", + "dw-ll_ucoco_384_bs5.torchscript.pt" + ] + }, + { + "id": 37, + "type": "ImageResizeKJ", + "pos": { + "0": 666, + "1": 743 + }, + "size": { + "0": 315, + "1": 266 + }, + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 174 + }, + { + "name": "get_image_size", + "type": "IMAGE", + "link": null + }, + { + "name": "width_input", + "type": "INT", + "link": null, + "widget": { + "name": "width_input" + } + }, + { + "name": "height_input", + "type": "INT", + "link": null, + "widget": { + "name": "height_input" + } + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 130 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "width", + "type": "INT", + "links": null, + "shape": 3 + }, + { + "name": "height", + "type": "INT", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "ImageResizeKJ" + }, + "widgets_values": [ + 512, + 512, + "lanczos", + true, + 16, + 0, + 0, + "disabled" + ] + }, + { + "id": 61, + "type": "GetImageSizeAndCount", + "pos": { + "0": 1018, + "1": 743 + }, + "size": { + "0": 277.20001220703125, + "1": 86 + }, + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 130 + } + ], + "outputs": [ + { + "name": "image", + "type": "IMAGE", + "links": [ + 135, + 168 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "288 width", + "type": "INT", + "links": null, + "shape": 3 + }, + { + "name": "512 height", + "type": "INT", + "links": null, + "shape": 3 + }, + { + "name": "49 count", + "type": "INT", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "GetImageSizeAndCount" + }, + "widgets_values": [] + }, { "id": 58, "type": "ImageConcatMulti", "pos": { - "0": 1476, - "1": 714 + "0": 1439, + "1": 735 }, "size": { "0": 210, @@ -414,57 +677,63 @@ ] }, { - "id": 30, - "type": "CogVideoTextEncode", + "id": 71, + "type": "DownloadAndLoadCogVideoGGUFModel", "pos": { - "0": 513, - "1": 286 + "0": 515, + "1": 35 }, "size": { - "0": 471.90142822265625, - "1": 168.08047485351562 + "0": 466.3737487792969, + "1": 174 }, "flags": {}, - "order": 3, + "order": 1, 
"mode": 0, "inputs": [ { - "name": "clip", - "type": "CLIP", - "link": 54 + "name": "pab_config", + "type": "PAB_CONFIG", + "link": null + }, + { + "name": "block_edit", + "type": "TRANSFORMERBLOCKS", + "link": null } ], "outputs": [ { - "name": "conditioning", - "type": "CONDITIONING", + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", "links": [ - 113, - 128 + 165 ], "slot_index": 0, "shape": 3 } ], "properties": { - "Node name for S&R": "CogVideoTextEncode" + "Node name for S&R": "DownloadAndLoadCogVideoGGUFModel" }, "widgets_values": [ - "fireball travels across a movie scene", - 1, - true + "CogVideoX_5b_fun_1_1_Pose_GGUF_Q4_0.safetensors", + "bf16", + false, + "main_device", + false ] }, { "id": 44, "type": "VHS_VideoCombine", "pos": { - "0": 1847, - "1": -22 + "0": 1842, + "1": -5 }, "size": [ - 1635.8468017578125, - 980.4377632141113 + 1186.0863037109375, + 1442.1649487639127 ], "flags": {}, "order": 14, @@ -516,7 +785,7 @@ "hidden": false, "paused": false, "params": { - "filename": "CogVideoX_Fun_00054.mp4", + "filename": "CogVideoX_Fun_Pose_00004.mp4", "subfolder": "", "type": "temp", "format": "video/h264-mp4", @@ -526,162 +795,19 @@ } } }, - { - "id": 56, - "type": "DWPreprocessor", - "pos": { - "0": 211, - "1": 746 - }, - "size": { - "0": 371.6333312988281, - "1": 222 - }, - "flags": {}, - "order": 7, - "mode": 0, - "inputs": [ - { - "name": "image", - "type": "IMAGE", - "link": 158 - } - ], - "outputs": [ - { - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 159 - ], - "slot_index": 0, - "shape": 3 - }, - { - "name": "POSE_KEYPOINT", - "type": "POSE_KEYPOINT", - "links": null, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "DWPreprocessor" - }, - "widgets_values": [ - "enable", - "enable", - "enable", - 512, - "yolox_l.torchscript.pt", - "dw-ll_ucoco_384_bs5.torchscript.pt" - ] - }, - { - "id": 65, - "type": "VHS_LoadVideo", - "pos": { - "0": -510, - "1": 568 - }, - "size": [ - 642.7533569335938, - 702.6101525779661 - ], - "flags": {}, - "order": 5, - "mode": 0, - "inputs": [ - { - "name": "meta_batch", - "type": "VHS_BatchManager", - "link": null - }, - { - "name": "vae", - "type": "VAE", - "link": null - }, - { - "name": "frame_load_cap", - "type": "INT", - "link": 152, - "widget": { - "name": "frame_load_cap" - } - } - ], - "outputs": [ - { - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 158 - ], - "slot_index": 0, - "shape": 3 - }, - { - "name": "frame_count", - "type": "INT", - "links": null, - "shape": 3 - }, - { - "name": "audio", - "type": "AUDIO", - "links": null, - "shape": 3 - }, - { - "name": "video_info", - "type": "VHS_VIDEOINFO", - "links": [], - "slot_index": 3, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "VHS_LoadVideo" - }, - "widgets_values": { - "video": "pose_slide.mp4", - "force_rate": 0, - "force_size": "Disabled", - "custom_width": 512, - "custom_height": 512, - "frame_load_cap": 17, - "skip_first_frames": 0, - "select_every_nth": 2, - "choose video to upload": "image", - "videopreview": { - "hidden": false, - "paused": false, - "params": { - "frame_load_cap": 17, - "skip_first_frames": 0, - "force_rate": 0, - "filename": "pose_slide.mp4", - "type": "input", - "format": "video/mp4", - "select_every_nth": 2 - }, - "muted": false - } - } - }, { "id": 72, "type": "INTConstant", "pos": { - "0": -515, - "1": 288 + "0": -265, + "1": 347 }, "size": { "0": 210, "1": 58 }, "flags": {}, - "order": 1, + "order": 2, "mode": 0, "inputs": [], "outputs": [ @@ -689,8 +815,8 @@ "name": "value", "type": "INT", "links": [ - 
151, - 152 + 152, + 169 ], "slot_index": 0, "shape": 3 @@ -705,138 +831,6 @@ ], "color": "#1b4669", "bgcolor": "#29699c" - }, - { - "id": 71, - "type": "DownloadAndLoadCogVideoGGUFModel", - "pos": { - "0": 478, - "1": -3 - }, - "size": { - "0": 466.3737487792969, - "1": 174 - }, - "flags": {}, - "order": 2, - "mode": 0, - "inputs": [ - { - "name": "pab_config", - "type": "PAB_CONFIG", - "link": null - }, - { - "name": "block_edit", - "type": "TRANSFORMERBLOCKS", - "link": null - } - ], - "outputs": [ - { - "name": "cogvideo_pipe", - "type": "COGVIDEOPIPE", - "links": [ - 148 - ], - "slot_index": 0, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "DownloadAndLoadCogVideoGGUFModel" - }, - "widgets_values": [ - "CogVideoX_5b_fun_1_1_Pose_GGUF_Q4_0.safetensors", - "bf16", - false, - "main_device", - false - ] - }, - { - "id": 54, - "type": "CogVideoXFunVid2VidSampler", - "pos": { - "0": 1067, - "1": 283 - }, - "size": { - "0": 315, - "1": 378 - }, - "flags": {}, - "order": 10, - "mode": 0, - "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 148 - }, - { - "name": "positive", - "type": "CONDITIONING", - "link": 113 - }, - { - "name": "negative", - "type": "CONDITIONING", - "link": 114 - }, - { - "name": "validation_video", - "type": "IMAGE", - "link": null - }, - { - "name": "control_video", - "type": "IMAGE", - "link": 131 - }, - { - "name": "video_length", - "type": "INT", - "link": 151, - "widget": { - "name": "video_length" - } - } - ], - "outputs": [ - { - "name": "cogvideo_pipe", - "type": "COGVIDEOPIPE", - "links": [ - 115 - ], - "shape": 3 - }, - { - "name": "samples", - "type": "LATENT", - "links": [ - 116 - ], - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "CogVideoXFunVid2VidSampler" - }, - "widgets_values": [ - 17, - 512, - 88311810545489, - "fixed", - 25, - 6, - "DPM++", - 1, - 0.7000000000000001, - 0, - 1 - ] } ], "links": [ @@ -856,38 +850,6 @@ 0, "CLIP" ], - [ - 113, - 30, - 0, - 54, - 1, - "CONDITIONING" - ], - [ - 114, - 31, - 0, - 54, - 2, - "CONDITIONING" - ], - [ - 115, - 54, - 0, - 11, - 0, - "COGVIDEOPIPE" - ], - [ - 116, - 54, - 1, - 11, - 1, - "LATENT" - ], [ 124, 11, @@ -928,14 +890,6 @@ 0, "IMAGE" ], - [ - 131, - 61, - 0, - 54, - 4, - "IMAGE" - ], [ 135, 61, @@ -944,14 +898,6 @@ 0, "IMAGE" ], - [ - 148, - 71, - 0, - 54, - 0, - "COGVIDEOPIPE" - ], [ 150, 59, @@ -960,14 +906,6 @@ 0, "IMAGE" ], - [ - 151, - 72, - 0, - 54, - 5, - "INT" - ], [ 152, 72, @@ -977,16 +915,72 @@ "INT" ], [ - 158, + 165, + 71, + 0, + 79, + 0, + "COGVIDEOPIPE" + ], + [ + 166, + 30, + 0, + 79, + 1, + "CONDITIONING" + ], + [ + 167, + 31, + 0, + 79, + 2, + "CONDITIONING" + ], + [ + 168, + 61, + 0, + 79, + 3, + "IMAGE" + ], + [ + 169, + 72, + 0, + 79, + 4, + "INT" + ], + [ + 170, + 79, + 0, + 11, + 0, + "COGVIDEOPIPE" + ], + [ + 171, + 79, + 1, + 11, + 1, + "LATENT" + ], + [ + 173, 65, 0, - 56, + 80, 0, "IMAGE" ], [ - 159, - 56, + 174, + 80, 0, 37, 0, @@ -997,10 +991,10 @@ "config": {}, "extra": { "ds": { - "scale": 0.5730855330117133, + "scale": 0.5209868481924667, "offset": [ - 798.5395320681218, - 157.60944992071092 + 329.16752736137005, + 119.68471403460902 ] } }, diff --git a/nodes.py b/nodes.py index 41d3ec7..95811ff 100644 --- a/nodes.py +++ b/nodes.py @@ -532,30 +532,13 @@ class DownloadAndLoadCogVideoGGUFModel: vae.load_state_dict(vae_sd) pipe = CogVideoXPipeline(vae, transformer, scheduler, pab_config=pab_config) - # compilation - # if compile == "torch": - # torch._dynamo.config.suppress_errors = True - # 
pipe.transformer.to(memory_format=torch.channels_last) - # pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True) - # elif compile == "onediff": - # from onediffx import compile_pipe - # os.environ['NEXFORT_FX_FORCE_TRITON_SDPA'] = '1' - - # pipe = compile_pipe( - # pipe, - # backend="nexfort", - # options= {"mode": "max-optimize:max-autotune:max-autotune", "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": False, "triton.fuse_attention_allow_fp16_reduction": False}}, - # ignores=["vae"], - # fuse_qkv_projections=True, - # ) - if enable_sequential_cpu_offload: pipe.enable_sequential_cpu_offload() pipeline = { "pipe": pipe, "dtype": vae_dtype, - "base_path": "Fun" if "fun" in model else "sad", + "base_path": model, "onediff": True if compile == "onediff" else False, "cpu_offloading": enable_sequential_cpu_offload, "scheduler_config": scheduler_config @@ -833,7 +816,7 @@ class CogVideoSampler: base_path = pipeline["base_path"] - assert "Fun" not in base_path, "'Fun' models not supported in 'CogVideoSampler', use the 'CogVideoXFunSampler'" + assert "fun" not in base_path.lower(), "'Fun' models not supported in 'CogVideoSampler', use the 'CogVideoXFunSampler'" assert t_tile_length > t_tile_overlap, "t_tile_length must be greater than t_tile_overlap" assert t_tile_length <= num_frames, "t_tile_length must be equal or less than num_frames" t_tile_length = t_tile_length // 4 @@ -898,7 +881,7 @@ class CogVideoDecode: "tile_sample_min_width": ("INT", {"default": 360, "min": 16, "max": 2048, "step": 8, "tooltip": "Minimum tile width, default is half the width"}), "tile_overlap_factor_height": ("FLOAT", {"default": 0.2, "min": 0.0, "max": 1.0, "step": 0.001}), "tile_overlap_factor_width": ("FLOAT", {"default": 0.2, "min": 0.0, "max": 1.0, "step": 0.001}), - "enable_vae_slicing": ("BOOLEAN", {"default": True, "tooltip": "VAE will split the input tensor in slices to compute decoding in several steps. 
This is useful to save some memory and allow larger batch sizes."}), + "auto_tile_size": ("BOOLEAN", {"default": True, "tooltip": "Auto size based on height and width, default is half the size"}), } } @@ -907,24 +890,26 @@ class CogVideoDecode: FUNCTION = "decode" CATEGORY = "CogVideoWrapper" - def decode(self, pipeline, samples, enable_vae_tiling, tile_sample_min_height, tile_sample_min_width, tile_overlap_factor_height, tile_overlap_factor_width, enable_vae_slicing=True): + def decode(self, pipeline, samples, enable_vae_tiling, tile_sample_min_height, tile_sample_min_width, tile_overlap_factor_height, tile_overlap_factor_width, auto_tile_size=True): device = mm.get_torch_device() offload_device = mm.unet_offload_device() latents = samples["samples"] vae = pipeline["pipe"].vae - if enable_vae_slicing: - vae.enable_slicing() - else: - vae.disable_slicing() + + vae.enable_slicing() + if not pipeline["cpu_offloading"]: vae.to(device) if enable_vae_tiling: - vae.enable_tiling( - tile_sample_min_height=tile_sample_min_height, - tile_sample_min_width=tile_sample_min_width, - tile_overlap_factor_height=tile_overlap_factor_height, - tile_overlap_factor_width=tile_overlap_factor_width, - ) + if auto_tile_size: + vae.enable_tiling() + else: + vae.enable_tiling( + tile_sample_min_height=tile_sample_min_height, + tile_sample_min_width=tile_sample_min_width, + tile_overlap_factor_height=tile_overlap_factor_height, + tile_overlap_factor_width=tile_overlap_factor_width, + ) else: vae.disable_tiling() latents = latents.to(vae.dtype) @@ -1005,7 +990,8 @@ class CogVideoXFunSampler: pipe = pipeline["pipe"] dtype = pipeline["dtype"] base_path = pipeline["base_path"] - assert "Fun" in base_path, "'Unfun' models not supported in 'CogVideoXFunSampler', use the 'CogVideoSampler'" + assert "fun" in base_path.lower(), "'Unfun' models not supported in 'CogVideoXFunSampler', use the 'CogVideoSampler'" + assert "pose" not in base_path.lower(), "'Pose' models not supported in 'CogVideoXFunSampler', use the 'CogVideoXFunControlSampler'" if not pipeline["cpu_offloading"]: pipe.enable_model_cpu_offload(device=device) @@ -1075,19 +1061,10 @@ class CogVideoXFunVid2VidSampler: "negative": ("CONDITIONING", ), "video_length": ("INT", {"default": 49, "min": 5, "max": 49, "step": 4}), "base_resolution": ( - [ - 256, - 320, - 384, - 448, - 512, - 768, - 960, - 1024, - ], {"default": 768} + [256,320,384,448,512,768,960,1024,], {"default": 512} ), - "seed": ("INT", {"default": 43, "min": 0, "max": 0xffffffffffffffff}), - "steps": ("INT", {"default": 50, "min": 1, "max": 200, "step": 1}), + "seed": ("INT", {"default": 42, "min": 0, "max": 0xffffffffffffffff}), + "steps": ("INT", {"default": 25, "min": 1, "max": 200, "step": 1}), "cfg": ("FLOAT", {"default": 6.0, "min": 1.0, "max": 20.0, "step": 0.01}), "scheduler": ( [ @@ -1108,13 +1085,7 @@ class CogVideoXFunVid2VidSampler: } ), "denoise_strength": ("FLOAT", {"default": 0.70, "min": 0.05, "max": 1.00, "step": 0.01}), - }, - "optional":{ "validation_video": ("IMAGE",), - "control_video": ("IMAGE",), - "control_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "control_start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01}), - "control_end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}), }, } @@ -1124,14 +1095,15 @@ class CogVideoXFunVid2VidSampler: CATEGORY = "CogVideoWrapper" def process(self, pipeline, positive, negative, video_length, base_resolution, seed, steps, cfg, denoise_strength, 
scheduler,
-                validation_video=None, control_video=None, control_strength=1.0, control_start_percent=0.0, control_end_percent=1.0):
+                validation_video):
         device = mm.get_torch_device()
         offload_device = mm.unet_offload_device()
         pipe = pipeline["pipe"]
         dtype = pipeline["dtype"]
         base_path = pipeline["base_path"]
 
-        assert "Fun" in base_path, "'Unfun' models not supported in 'CogVideoXFunSampler', use the 'CogVideoSampler'"
+        assert "fun" in base_path.lower(), "'Unfun' models not supported in 'CogVideoXFunVid2VidSampler', use the 'CogVideoSampler'"
+        assert "pose" not in base_path.lower(), "'Pose' models not supported in 'CogVideoXFunVid2VidSampler', use the 'CogVideoXFunControlSampler'"
 
         if not pipeline["cpu_offloading"]:
             pipe.enable_model_cpu_offload(device=device)
@@ -1141,12 +1113,8 @@ class CogVideoXFunVid2VidSampler:
 
         # Count most suitable height and width
         aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
 
-        if validation_video is not None:
-            validation_video = np.array(validation_video.cpu().numpy() * 255, np.uint8)
-            original_width, original_height = Image.fromarray(validation_video[0]).size
-        elif control_video is not None:
-            control_video = np.array(control_video.cpu().numpy() * 255, np.uint8)
-            original_width, original_height = Image.fromarray(control_video[0]).size
+        validation_video = np.array(validation_video.cpu().numpy() * 255, np.uint8)
+        original_width, original_height = Image.fromarray(validation_video[0]).size
 
         closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
         height, width = [int(x / 16) * 16 for x in closest_size]
@@ -1165,10 +1133,7 @@ class CogVideoXFunVid2VidSampler:
         autocast_context = torch.autocast(mm.get_autocast_device(device)) if autocastcondition else nullcontext()
         with autocast_context:
             video_length = int((video_length - 1) // pipe.vae.config.temporal_compression_ratio * pipe.vae.config.temporal_compression_ratio) + 1 if video_length != 1 else 1
-            if validation_video is not None:
-                input_video, input_video_mask, clip_image = get_video_to_video_latent(validation_video, video_length=video_length, sample_size=(height, width))
-            elif control_video is not None:
-                input_video, input_video_mask, clip_image = get_video_to_video_latent(control_video, video_length=video_length, sample_size=(height, width))
+            input_video, input_video_mask, clip_image = get_video_to_video_latent(validation_video, video_length=video_length, sample_size=(height, width))
 
             # for _lora_path, _lora_weight in zip(cogvideoxfun_model.get("loras", []), cogvideoxfun_model.get("strength_model", [])):
             #     pipeline = merge_lora(pipeline, _lora_path, _lora_weight)
@@ -1185,21 +1150,124 @@ class CogVideoXFunVid2VidSampler:
                 "comfyui_progressbar": True,
             }
 
-            if control_video is not None:
-                latents = pipe(
-                    **common_params,
-                    control_video=input_video,
-                    control_strength=control_strength,
-                    control_start_percent=control_start_percent,
-                    control_end_percent=control_end_percent
-                )
-            else:
-                latents = pipe(
-                    **common_params,
-                    video=input_video,
-                    mask_video=input_video_mask,
-                    strength=float(denoise_strength)
-                )
+            latents = pipe(
+                **common_params,
+                video=input_video,
+                mask_video=input_video_mask,
+                strength=float(denoise_strength)
+            )
+
+        # for _lora_path, _lora_weight in zip(cogvideoxfun_model.get("loras", []), cogvideoxfun_model.get("strength_model", [])):
+        #     pipeline = unmerge_lora(pipeline, _lora_path, _lora_weight)
+        return (pipeline, {"samples": latents})
+
+class 
CogVideoXFunControlSampler:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "pipeline": ("COGVIDEOPIPE",),
+                "positive": ("CONDITIONING", ),
+                "negative": ("CONDITIONING", ),
+                "video_length": ("INT", {"default": 49, "min": 5, "max": 49, "step": 4}),
+                "base_resolution": (
+                    [256,320,384,448,512,768,960,1024,], {"default": 512}
+                ),
+                "seed": ("INT", {"default": 42, "min": 0, "max": 0xffffffffffffffff}),
+                "steps": ("INT", {"default": 25, "min": 1, "max": 200, "step": 1}),
+                "cfg": ("FLOAT", {"default": 6.0, "min": 1.0, "max": 20.0, "step": 0.01}),
+                "scheduler": (
+                    [
+                        "Euler",
+                        "Euler A",
+                        "DPM++",
+                        "PNDM",
+                        "DDIM",
+                        "SASolverScheduler",
+                        "UniPCMultistepScheduler",
+                        "HeunDiscreteScheduler",
+                        "DEISMultistepScheduler",
+                        "CogVideoXDDIM",
+                        "CogVideoXDPMScheduler",
+                    ],
+                    {
+                        "default": 'DDIM'
+                    }
+                ),
+                "control_video": ("IMAGE",),
+                "control_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
+                "control_start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01}),
+                "control_end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
+            },
+        }
+
+    RETURN_TYPES = ("COGVIDEOPIPE", "LATENT",)
+    RETURN_NAMES = ("cogvideo_pipe", "samples",)
+    FUNCTION = "process"
+    CATEGORY = "CogVideoWrapper"
+
+    def process(self, pipeline, positive, negative, video_length, base_resolution, seed, steps, cfg, scheduler,
+                control_video=None, control_strength=1.0, control_start_percent=0.0, control_end_percent=1.0):
+        device = mm.get_torch_device()
+        offload_device = mm.unet_offload_device()
+        pipe = pipeline["pipe"]
+        dtype = pipeline["dtype"]
+        base_path = pipeline["base_path"]
+
+        assert "fun" in base_path.lower(), "'Unfun' models not supported in 'CogVideoXFunControlSampler', use the 'CogVideoSampler'"
+
+        if not pipeline["cpu_offloading"]:
+            pipe.enable_model_cpu_offload(device=device)
+
+        mm.soft_empty_cache()
+
+        # Count most suitable height and width
+        aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
+
+        control_video = np.array(control_video.cpu().numpy() * 255, np.uint8)
+        original_width, original_height = Image.fromarray(control_video[0]).size
+
+        closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
+        height, width = [int(x / 16) * 16 for x in closest_size]
+
+        # Load Sampler
+        scheduler_config = pipeline["scheduler_config"]
+        if scheduler in scheduler_mapping:
+            noise_scheduler = scheduler_mapping[scheduler].from_config(scheduler_config)
+            pipe.scheduler = noise_scheduler
+        else:
+            raise ValueError(f"Unknown scheduler: {scheduler}")
+
+        generator= torch.Generator(device).manual_seed(seed)
+
+        autocastcondition = not pipeline["onediff"]
+        autocast_context = torch.autocast(mm.get_autocast_device(device)) if autocastcondition else nullcontext()
+        with autocast_context:
+            video_length = int((video_length - 1) // pipe.vae.config.temporal_compression_ratio * pipe.vae.config.temporal_compression_ratio) + 1 if video_length != 1 else 1
+            input_video, input_video_mask, clip_image = get_video_to_video_latent(control_video, video_length=video_length, sample_size=(height, width))
+
+            # for _lora_path, _lora_weight in zip(cogvideoxfun_model.get("loras", []), cogvideoxfun_model.get("strength_model", [])):
+            #     pipeline = merge_lora(pipeline, _lora_path, _lora_weight)
+
+            common_params = {
+                "prompt_embeds": positive.to(dtype).to(device),
+                "negative_prompt_embeds": 
negative.to(dtype).to(device), + "num_frames": video_length, + "height": height, + "width": width, + "generator": generator, + "guidance_scale": cfg, + "num_inference_steps": steps, + "comfyui_progressbar": True, + } + + latents = pipe( + **common_params, + control_video=input_video, + control_strength=control_strength, + control_start_percent=control_start_percent, + control_end_percent=control_end_percent + ) # for _lora_path, _lora_weight in zip(cogvideoxfun_model.get("loras", []), cogvideoxfun_model.get("strength_model", [])): # pipeline = unmerge_lora(pipeline, _lora_path, _lora_weight) @@ -1214,6 +1282,7 @@ NODE_CLASS_MAPPINGS = { "CogVideoImageEncode": CogVideoImageEncode, "CogVideoXFunSampler": CogVideoXFunSampler, "CogVideoXFunVid2VidSampler": CogVideoXFunVid2VidSampler, + "CogVideoXFunControlSampler": CogVideoXFunControlSampler, "CogVideoTextEncodeCombine": CogVideoTextEncodeCombine, "DownloadAndLoadCogVideoGGUFModel": DownloadAndLoadCogVideoGGUFModel, "CogVideoPABConfig": CogVideoPABConfig, @@ -1228,6 +1297,7 @@ NODE_DISPLAY_NAME_MAPPINGS = { "CogVideoImageEncode": "CogVideo ImageEncode", "CogVideoXFunSampler": "CogVideoXFun Sampler", "CogVideoXFunVid2VidSampler": "CogVideoXFun Vid2Vid Sampler", + "CogVideoXFunControlSampler": "CogVideoXFun Control Sampler", "CogVideoTextEncodeCombine": "CogVideo TextEncode Combine", "DownloadAndLoadCogVideoGGUFModel": "(Down)load CogVideo GGUF Model", "CogVideoPABConfig": "CogVideo PABConfig",
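
For reference, outside the patch itself: a minimal, self-contained Python sketch of the frame-count rounding both new Fun samplers apply before sampling. The helper name snap_video_length is hypothetical; the ratio comes from pipe.vae.config.temporal_compression_ratio, which is 4 for the CogVideoX VAE, so requested lengths snap down to the form 4n + 1 (matching the 49-frame defaults in the node definitions above).

# Hypothetical standalone helper mirroring the samplers' expression:
#   video_length = int((video_length - 1) // r * r) + 1 if video_length != 1 else 1
def snap_video_length(video_length: int, temporal_compression_ratio: int = 4) -> int:
    if video_length == 1:
        return 1
    return (video_length - 1) // temporal_compression_ratio * temporal_compression_ratio + 1

assert snap_video_length(49) == 49  # already of the form 4n + 1, unchanged
assert snap_video_length(50) == 49  # snapped down to the nearest 4n + 1
assert snap_video_length(5) == 5    # the minimum length the nodes accept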