diff --git a/cogvideox_fun/lora_utils.py b/cogvideox_fun/lora_utils.py index 42038a5..3191c0c 100644 --- a/cogvideox_fun/lora_utils.py +++ b/cogvideox_fun/lora_utils.py @@ -474,4 +474,51 @@ def unmerge_lora(pipeline, lora_path, multiplier=1, device="cpu", dtype=torch.fl else: curr_layer.weight.data -= multiplier * alpha * torch.mm(weight_up, weight_down) - return pipeline \ No newline at end of file + return pipeline + +def load_lora_into_transformer(state_dict, transformer, adapter_name=None): + from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict + from diffusers.utils.peft_utils import get_peft_kwargs, get_adapter_name + from diffusers.utils.import_utils import is_peft_version + from diffusers.utils.state_dict_utils import convert_unet_state_dict_to_peft + keys = list(state_dict.keys()) + transformer_keys = [k for k in keys if k.startswith("transformer")] + state_dict = { + k.replace(f"transformer.", ""): v for k, v in state_dict.items() if k in transformer_keys + } + if len(state_dict.keys()) > 0: + # check with first key if is not in peft format + first_key = next(iter(state_dict.keys())) + if "lora_A" not in first_key: + state_dict = convert_unet_state_dict_to_peft(state_dict) + if adapter_name in getattr(transformer, "peft_config", {}): + raise ValueError( + f"Adapter name {adapter_name} already in use in the transformer - please select a new adapter name." + ) + rank = {} + for key, val in state_dict.items(): + if "lora_B" in key: + rank[key] = val.shape[1] + lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=None, peft_state_dict=state_dict) + if "use_dora" in lora_config_kwargs: + if lora_config_kwargs["use_dora"] and is_peft_version("<", "0.9.0"): + raise ValueError( + "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`." + ) + else: + lora_config_kwargs.pop("use_dora") + lora_config = LoraConfig(**lora_config_kwargs) + # adapter_name + if adapter_name is None: + adapter_name = get_adapter_name(transformer) + + inject_adapter_in_model(lora_config, transformer, adapter_name=adapter_name) + incompatible_keys = set_peft_model_state_dict(transformer, state_dict, adapter_name) + if incompatible_keys is not None: + # check only for unexpected keys + unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None) + if unexpected_keys: + print( + f"Loading adapter weights from state_dict led to unexpected keys not found in the model: " + f" {unexpected_keys}. " + ) \ No newline at end of file diff --git a/examples/cogvideo_5b_temporal_tiling_long_01.json b/examples/cogvideo_2b_context_schedule_test_01.json similarity index 79% rename from examples/cogvideo_5b_temporal_tiling_long_01.json rename to examples/cogvideo_2b_context_schedule_test_01.json index 79b00e2..ed7b6fa 100644 --- a/examples/cogvideo_5b_temporal_tiling_long_01.json +++ b/examples/cogvideo_2b_context_schedule_test_01.json @@ -1,28 +1,75 @@ { - "last_node_id": 33, - "last_link_id": 60, + "last_node_id": 34, + "last_link_id": 61, "nodes": [ + { + "id": 33, + "type": "GetImageSizeAndCount", + "pos": { + "0": 1176, + "1": 122 + }, + "size": { + "0": 210, + "1": 86 + }, + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 59 + } + ], + "outputs": [ + { + "name": "image", + "type": "IMAGE", + "links": [ + 60 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "720 width", + "type": "INT", + "links": null, + "shape": 3 + }, + { + "name": "480 height", + "type": "INT", + "links": null, + "shape": 3 + }, + { + "name": "104 count", + "type": "INT", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "GetImageSizeAndCount" + }, + "widgets_values": [] + }, { "id": 30, "type": "CogVideoTextEncode", "pos": { "0": 500, - "1": 308, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 - }, - "size": { - "0": 474.8450012207031, - "1": 164.7423553466797 + "1": 308 }, + "size": [ + 474.8035864085422, + 211.10369504535595 + ], "flags": {}, - "order": 2, + "order": 3, "mode": 0, "inputs": [ { @@ -46,23 +93,58 @@ "Node name for S&R": "CogVideoTextEncode" }, "widgets_values": [ - "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature\nacoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters\nthrough the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The\nbackground includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical\nperformance." + "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature\nacoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters\nthrough the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The\nbackground includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical\nperformance.", + 1, + true + ] + }, + { + "id": 31, + "type": "CogVideoTextEncode", + "pos": { + "0": 508, + "1": 576 + }, + "size": { + "0": 463.01251220703125, + "1": 124 + }, + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 56 + } + ], + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [ + 57 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoTextEncode" + }, + "widgets_values": [ + "", + 1, + true ] }, { "id": 20, "type": "CLIPLoader", "pos": { - "0": -59, - "1": 397, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "0": -37, + "1": 443 }, "size": { "0": 451.30548095703125, @@ -93,50 +175,105 @@ ] }, { - "id": 31, - "type": "CogVideoTextEncode", + "id": 11, + "type": "CogVideoDecode", "pos": { - "0": 503, - "1": 521, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "0": 1045, + "1": 776 }, "size": { - "0": 463.01251220703125, - "1": 98.10446166992188 + "0": 295.70111083984375, + "1": 198 }, "flags": {}, - "order": 3, + "order": 6, "mode": 0, "inputs": [ { - "name": "clip", - "type": "CLIP", - "link": 56 + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 37 + }, + { + "name": "samples", + "type": "LATENT", + "link": 38 } ], "outputs": [ { - "name": "conditioning", - "type": "CONDITIONING", + "name": "images", + "type": "IMAGE", "links": [ - 57 + 59 ], "slot_index": 0, "shape": 3 } ], "properties": { - "Node name for S&R": "CogVideoTextEncode" + "Node name for S&R": "CogVideoDecode" }, "widgets_values": [ - "" + true, + 96, + 96, + 0.083, + 0.083, + true + ] + }, + { + "id": 1, + "type": "DownloadAndLoadCogVideoModel", + "pos": { + "0": 652, + "1": 43 + }, + "size": { + "0": 315, + "1": 194 + }, + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "name": "pab_config", + "type": "PAB_CONFIG", + "link": null + }, + { + "name": "block_edit", + "type": "TRANSFORMERBLOCKS", + "link": null + }, + { + "name": "lora", + "type": "COGLORA", + "link": null + } + ], + "outputs": [ + { + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", + "links": [ + 36 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "DownloadAndLoadCogVideoModel" + }, + "widgets_values": [ + "THUDM/CogVideoX-2b", + "fp16", + "enabled", + "disabled", + false ] }, { @@ -144,22 +281,14 @@ "type": "VHS_VideoCombine", "pos": { "0": 1439, - "1": 122, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "1": 122 }, "size": [ 563.3333740234375, - 688.2124814882384 + 686.2222493489583 ], "flags": {}, - "order": 7, + "order": 8, "mode": 0, "inputs": [ { @@ -209,7 +338,7 @@ "hidden": false, "paused": false, "params": { - "filename": "CogVideo2B_long_00001.mp4", + "filename": "CogVideo2B_long_00005.mp4", "subfolder": "", "type": "temp", "format": "video/h264-mp4", @@ -219,48 +348,39 @@ } }, { - "id": 1, - "type": "DownloadAndLoadCogVideoModel", + "id": 34, + "type": "CogVideoContextOptions", "pos": { - "0": 653, - "1": 90, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "0": 1053, + "1": -84 }, "size": { "0": 315, "1": 154 }, "flags": {}, - "order": 1, + "order": 2, "mode": 0, "inputs": [], "outputs": [ { - "name": "cogvideo_pipe", - "type": "COGVIDEOPIPE", + "name": "context_options", + "type": "COGCONTEXT", "links": [ - 36 + 61 ], - "slot_index": 0, "shape": 3 } ], "properties": { - "Node name for S&R": "DownloadAndLoadCogVideoModel" + "Node name for S&R": "CogVideoContextOptions" }, "widgets_values": [ - "THUDM/CogVideoX-2b", - "fp16", - "disabled", - "disabled", - false + "uniform_standard", + 52, + 4, + 8, + true ] }, { @@ -268,22 +388,14 @@ "type": "CogVideoSampler", "pos": { "0": 1041, - "1": 342, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "1": 342 }, "size": { "0": 315, "1": 382 }, "flags": {}, - "order": 4, + "order": 5, "mode": 0, "inputs": [ { @@ -306,6 +418,16 @@ "name": "samples", "type": "LATENT", "link": null + }, + { + "name": "image_cond_latents", + "type": "LATENT", + "link": null + }, + { + "name": "context_options", + "type": "COGCONTEXT", + "link": 61 } ], "outputs": [ @@ -332,135 +454,14 @@ "widgets_values": [ 480, 720, - 96, - 25, + 104, + 32, 6, - 6, - 806286757407563, - "DDIM_tiled", - 48, - 8, + 42, + "fixed", + "CogVideoXDDIM", 1 ] - }, - { - "id": 11, - "type": "CogVideoDecode", - "pos": { - "0": 1049, - "1": 772, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 - }, - "size": [ - 295.70112532900725, - 198 - ], - "flags": {}, - "order": 5, - "mode": 0, - "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 37 - }, - { - "name": "samples", - "type": "LATENT", - "link": 38 - } - ], - "outputs": [ - { - "name": "images", - "type": "IMAGE", - "links": [ - 59 - ], - "slot_index": 0, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "CogVideoDecode" - }, - "widgets_values": [ - true, - 96, - 96, - 0.083, - 0.083, - true - ] - }, - { - "id": 33, - "type": "GetImageSizeAndCount", - "pos": { - "0": 1176, - "1": 122, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 - }, - "size": { - "0": 210, - "1": 86 - }, - "flags": {}, - "order": 6, - "mode": 0, - "inputs": [ - { - "name": "image", - "type": "IMAGE", - "link": 59 - } - ], - "outputs": [ - { - "name": "image", - "type": "IMAGE", - "links": [ - 60 - ], - "slot_index": 0, - "shape": 3 - }, - { - "name": "728 width", - "type": "INT", - "links": null, - "shape": 3 - }, - { - "name": "485 height", - "type": "INT", - "links": null, - "shape": 3 - }, - { - "name": "96 count", - "type": "INT", - "links": null, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "GetImageSizeAndCount" - } } ], "links": [ @@ -535,18 +536,26 @@ 32, 0, "IMAGE" + ], + [ + 61, + 34, + 0, + 22, + 5, + "COGCONTEXT" ] ], "groups": [], "config": {}, "extra": { "ds": { - "scale": 0.7513148009015777, + "scale": 0.8390545288825444, "offset": [ - 253.3863163213836, - 255.76127216744268 + -14.198557467892236, + 144.90015432747748 ] } }, "version": 0.4 -} +} \ No newline at end of file diff --git a/examples/cogvideo_5b_vid2vid_example_01.json b/examples/cogvideo_5b_vid2vid_example_01.json index 4aba556..c9eeffa 100644 --- a/examples/cogvideo_5b_vid2vid_example_01.json +++ b/examples/cogvideo_5b_vid2vid_example_01.json @@ -7,15 +7,7 @@ "type": "CLIPLoader", "pos": { "0": -29, - "1": 407, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "1": 407 }, "size": { "0": 451.30548095703125, @@ -50,19 +42,11 @@ "type": "CogVideoTextEncode", "pos": { "0": 503, - "1": 521, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "1": 521 }, "size": { "0": 463.01251220703125, - "1": 98.10446166992188 + "1": 124 }, "flags": {}, "order": 4, @@ -89,7 +73,9 @@ "Node name for S&R": "CogVideoTextEncode" }, "widgets_values": [ - "" + "", + 1, + true ] }, { @@ -97,15 +83,7 @@ "type": "ImageResizeKJ", "pos": { "0": 206, - "1": -69, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "1": -69 }, "size": { "0": 315, @@ -184,15 +162,7 @@ "type": "CogVideoTextEncode", "pos": { "0": 500, - "1": 308, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "1": 308 }, "size": { "0": 474.8450012207031, @@ -223,7 +193,9 @@ "Node name for S&R": "CogVideoTextEncode" }, "widgets_values": [ - "A high-definition nature video showcasing a brown bear as it gracefully runs down a crystal-clear stream, surrounded by the serene ambiance of a dense, verdant forest. The sunlight filters through the canopy of tall trees, casting dappled light on the forest floor, while the gentle sound of flowing water and rustling leaves creates a peaceful atmosphere. The brown bear's fur glistens in the sunlight, highlighting its striking red and white markings as it navigates the stream with agility and playfulness." + "A high-definition nature video showcasing a brown bear as it gracefully runs down a crystal-clear stream, surrounded by the serene ambiance of a dense, verdant forest. The sunlight filters through the canopy of tall trees, casting dappled light on the forest floor, while the gentle sound of flowing water and rustling leaves creates a peaceful atmosphere. The brown bear's fur glistens in the sunlight, highlighting its striking red and white markings as it navigates the stream with agility and playfulness.", + 1, + true ] }, { @@ -231,15 +203,7 @@ "type": "GetImageSizeAndCount", "pos": { "0": 603, - "1": -65, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "1": -65 }, "size": { "0": 202.2143096923828, @@ -289,8 +253,7 @@ "name": "33 count", "type": "INT", "links": [ - 178, - 181 + 178 ], "slot_index": 3, "shape": 3 @@ -298,26 +261,19 @@ ], "properties": { "Node name for S&R": "GetImageSizeAndCount" - } + }, + "widgets_values": [] }, { "id": 45, "type": "VHS_LoadVideo", "pos": { "0": -93, - "1": -153, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "1": -153 }, "size": [ - 235.1999969482422, - 359.5999984741211 + 247.455078125, + 365.7275390625 ], "flags": {}, "order": 5, @@ -404,15 +360,7 @@ "type": "GetImageSizeAndCount", "pos": { "0": 214, - "1": -234, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "1": -234 }, "size": { "0": 202.2143096923828, @@ -463,22 +411,15 @@ ], "properties": { "Node name for S&R": "GetImageSizeAndCount" - } + }, + "widgets_values": [] }, { "id": 69, "type": "INTConstant", "pos": { "0": -90, - "1": -305, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "1": -305 }, "size": { "0": 210, @@ -508,107 +449,37 @@ "color": "#1b4669", "bgcolor": "#29699c" }, - { - "id": 47, - "type": "VHS_VideoCombine", - "pos": { - "0": 1560, - "1": -379, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 - }, - "size": [ - 1110, - 310 - ], - "flags": {}, - "order": 14, - "mode": 0, - "inputs": [ - { - "name": "images", - "type": "IMAGE", - "link": 132 - }, - { - "name": "audio", - "type": "VHS_AUDIO", - "link": null - }, - { - "name": "meta_batch", - "type": "VHS_BatchManager", - "link": null - }, - { - "name": "vae", - "type": "VAE", - "link": null - } - ], - "outputs": [ - { - "name": "Filenames", - "type": "VHS_FILENAMES", - "links": null, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "VHS_VideoCombine" - }, - "widgets_values": { - "frame_rate": 8, - "loop_count": 0, - "filename_prefix": "CogVideoX_vid2vid", - "format": "video/h264-mp4", - "pix_fmt": "yuv420p", - "crf": 19, - "save_metadata": true, - "pingpong": false, - "save_output": false, - "videopreview": { - "hidden": false, - "paused": false, - "params": { - "filename": "AnimateDiff_00001.mp4", - "subfolder": "", - "type": "temp", - "format": "video/h264-mp4", - "frame_rate": 8 - } - } - } - }, { "id": 1, "type": "DownloadAndLoadCogVideoModel", "pos": { "0": 606, - "1": 85, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "1": 85 }, "size": { "0": 315, - "1": 154 + "1": 194 }, "flags": {}, "order": 2, "mode": 0, - "inputs": [], + "inputs": [ + { + "name": "pab_config", + "type": "PAB_CONFIG", + "link": null + }, + { + "name": "block_edit", + "type": "TRANSFORMERBLOCKS", + "link": null + }, + { + "name": "lora", + "type": "COGLORA", + "link": null + } + ], "outputs": [ { "name": "cogvideo_pipe", @@ -632,20 +503,215 @@ false ] }, + { + "id": 11, + "type": "CogVideoDecode", + "pos": { + "0": 1097, + "1": 681 + }, + "size": { + "0": 301.1664123535156, + "1": 198 + }, + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 166 + }, + { + "name": "samples", + "type": "LATENT", + "link": 167 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 118 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoDecode" + }, + "widgets_values": [ + false, + 96, + 96, + 0.083, + 0.083, + true + ] + }, + { + "id": 37, + "type": "CogVideoImageEncode", + "pos": { + "0": 975, + "1": -73 + }, + "size": { + "0": 210, + "1": 122 + }, + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 83, + "slot_index": 0 + }, + { + "name": "image", + "type": "IMAGE", + "link": 129, + "slot_index": 1 + }, + { + "name": "mask", + "type": "MASK", + "link": null + } + ], + "outputs": [ + { + "name": "samples", + "type": "LATENT", + "links": [ + 172 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoImageEncode" + }, + "widgets_values": [ + 8, + true + ] + }, + { + "id": 55, + "type": "GetImageSizeAndCount", + "pos": { + "0": 1195, + "1": 154 + }, + "size": { + "0": 210, + "1": 86 + }, + "flags": {}, + "order": 12, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 118, + "slot_index": 0 + } + ], + "outputs": [ + { + "name": "image", + "type": "IMAGE", + "links": [ + 170 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "720 width", + "type": "INT", + "links": null, + "shape": 3 + }, + { + "name": "480 height", + "type": "INT", + "links": null, + "shape": 3 + }, + { + "name": "33 count", + "type": "INT", + "links": [], + "slot_index": 3, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "GetImageSizeAndCount" + }, + "widgets_values": [] + }, + { + "id": 58, + "type": "ImageConcanate", + "pos": { + "0": 1434, + "1": 289 + }, + "size": { + "0": 315, + "1": 102 + }, + "flags": {}, + "order": 13, + "mode": 0, + "inputs": [ + { + "name": "image1", + "type": "IMAGE", + "link": 191 + }, + { + "name": "image2", + "type": "IMAGE", + "link": 170 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 132 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "ImageConcanate" + }, + "widgets_values": [ + "right", + false + ] + }, { "id": 64, "type": "CogVideoSampler", "pos": { "0": 1090, - "1": 290, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "1": 290 }, "size": { "0": 312.9841613769531, @@ -698,15 +764,6 @@ "widget": { "name": "num_frames" } - }, - { - "name": "t_tile_length", - "type": "INT", - "link": 181, - "slot_index": 7, - "widget": { - "name": "t_tile_length" - } } ], "outputs": [ @@ -738,245 +795,79 @@ 6, 9, "fixed", - "DPM", - "DDIM", - 8, - 0.85 + "CogVideoXDDIM", + 0.8 ] }, { - "id": 11, - "type": "CogVideoDecode", + "id": 47, + "type": "VHS_VideoCombine", "pos": { - "0": 1097, - "1": 681, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "0": 1788, + "1": -364 }, "size": [ - 301.1664045038119, - 198 + 1110, + 687.3333333333333 ], "flags": {}, - "order": 11, + "order": 14, "mode": 0, "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 166 - }, - { - "name": "samples", - "type": "LATENT", - "link": 167 - } - ], - "outputs": [ { "name": "images", "type": "IMAGE", - "links": [ - 118 - ], - "slot_index": 0, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "CogVideoDecode" - }, - "widgets_values": [ - false, - 96, - 96, - 0.083, - 0.083, - true - ] - }, - { - "id": 37, - "type": "CogVideoImageEncode", - "pos": { - "0": 975, - "1": -73, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 - }, - "size": { - "0": 210, - "1": 122 - }, - "flags": {}, - "order": 9, - "mode": 0, - "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 83, - "slot_index": 0 + "link": 132 }, { - "name": "image", - "type": "IMAGE", - "link": 129, - "slot_index": 1 + "name": "audio", + "type": "VHS_AUDIO", + "link": null }, { - "name": "mask", - "type": "MASK", + "name": "meta_batch", + "type": "VHS_BatchManager", + "link": null + }, + { + "name": "vae", + "type": "VAE", "link": null } ], "outputs": [ { - "name": "samples", - "type": "LATENT", - "links": [ - 172 - ], - "slot_index": 0, + "name": "Filenames", + "type": "VHS_FILENAMES", + "links": null, "shape": 3 } ], "properties": { - "Node name for S&R": "CogVideoImageEncode" + "Node name for S&R": "VHS_VideoCombine" }, - "widgets_values": [ - 8, - true - ] - }, - { - "id": 55, - "type": "GetImageSizeAndCount", - "pos": { - "0": 1195, - "1": 154, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 - }, - "size": { - "0": 210, - "1": 86 - }, - "flags": {}, - "order": 12, - "mode": 0, - "inputs": [ - { - "name": "image", - "type": "IMAGE", - "link": 118, - "slot_index": 0 + "widgets_values": { + "frame_rate": 8, + "loop_count": 0, + "filename_prefix": "CogVideoX_vid2vid", + "format": "video/h264-mp4", + "pix_fmt": "yuv420p", + "crf": 19, + "save_metadata": true, + "pingpong": false, + "save_output": false, + "videopreview": { + "hidden": false, + "paused": false, + "params": { + "filename": "CogVideoX_vid2vid_00001.mp4", + "subfolder": "", + "type": "temp", + "format": "video/h264-mp4", + "frame_rate": 8 + } } - ], - "outputs": [ - { - "name": "image", - "type": "IMAGE", - "links": [ - 170 - ], - "slot_index": 0, - "shape": 3 - }, - { - "name": "720 width", - "type": "INT", - "links": null, - "shape": 3 - }, - { - "name": "480 height", - "type": "INT", - "links": null, - "shape": 3 - }, - { - "name": "32 count", - "type": "INT", - "links": [], - "slot_index": 3, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "GetImageSizeAndCount" } - }, - { - "id": 58, - "type": "ImageConcanate", - "pos": { - "0": 1434, - "1": 289, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 - }, - "size": { - "0": 315, - "1": 102 - }, - "flags": {}, - "order": 13, - "mode": 0, - "inputs": [ - { - "name": "image1", - "type": "IMAGE", - "link": 191 - }, - { - "name": "image2", - "type": "IMAGE", - "link": 170 - } - ], - "outputs": [ - { - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 132 - ], - "slot_index": 0, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "ImageConcanate" - }, - "widgets_values": [ - "right", - false - ] } ], "links": [ @@ -1140,14 +1031,6 @@ 0, "IMAGE" ], - [ - 181, - 57, - 3, - 64, - 7, - "INT" - ], [ 191, 57, @@ -1161,10 +1044,10 @@ "config": {}, "extra": { "ds": { - "scale": 0.7513148009015777, + "scale": 0.6934334949442514, "offset": [ - 280.8935954961883, - 403.945992992638 + 216.19566166079386, + 455.16205928476876 ] } }, diff --git a/examples/cogvideox_5b_example_01.json b/examples/cogvideox_5b_example_01.json index 6c10fcb..38f089e 100644 --- a/examples/cogvideox_5b_example_01.json +++ b/examples/cogvideox_5b_example_01.json @@ -7,19 +7,11 @@ "type": "CogVideoTextEncode", "pos": { "0": 503, - "1": 521, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "1": 521 }, "size": { "0": 463.01251220703125, - "1": 98.10446166992188 + "1": 124 }, "flags": {}, "order": 3, @@ -46,7 +38,9 @@ "Node name for S&R": "CogVideoTextEncode" }, "widgets_values": [ - "" + "", + 1, + true ] }, { @@ -54,15 +48,7 @@ "type": "CogVideoTextEncode", "pos": { "0": 500, - "1": 308, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "1": 308 }, "size": { "0": 471.90142822265625, @@ -93,86 +79,9 @@ "Node name for S&R": "CogVideoTextEncode" }, "widgets_values": [ - "A golden retriever, sporting sleek black sunglasses, with its lengthy fur flowing in the breeze, sprints playfully across a rooftop terrace, recently refreshed by a light rain. The scene unfolds from a distance, the dog's energetic bounds growing larger as it approaches the camera, its tail wagging with unrestrained joy, while droplets of water glisten on the concrete behind it. The overcast sky provides a dramatic backdrop, emphasizing the vibrant golden coat of the canine as it dashes towards the viewer.\n\n" - ] - }, - { - "id": 34, - "type": "CogVideoSampler", - "pos": { - "0": 1041, - "1": 342, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 - }, - "size": { - "0": 315.8404846191406, - "1": 358 - }, - "flags": {}, - "order": 4, - "mode": 0, - "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 60 - }, - { - "name": "positive", - "type": "CONDITIONING", - "link": 61 - }, - { - "name": "negative", - "type": "CONDITIONING", - "link": 62 - }, - { - "name": "samples", - "type": "LATENT", - "link": null - } - ], - "outputs": [ - { - "name": "cogvideo_pipe", - "type": "COGVIDEOPIPE", - "links": [ - 63 - ], - "shape": 3 - }, - { - "name": "samples", - "type": "LATENT", - "links": [ - 64 - ], - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "CogVideoSampler" - }, - "widgets_values": [ - 480, - 720, - 49, - 50, - 6, - 806286757407563, - "fixed", - "DPM", - 49, - 8, - 1 + "A golden retriever, sporting sleek black sunglasses, with its lengthy fur flowing in the breeze, sprints playfully across a rooftop terrace, recently refreshed by a light rain. The scene unfolds from a distance, the dog's energetic bounds growing larger as it approaches the camera, its tail wagging with unrestrained joy, while droplets of water glisten on the concrete behind it. The overcast sky provides a dramatic backdrop, emphasizing the vibrant golden coat of the canine as it dashes towards the viewer.\n\n", + 1, + true ] }, { @@ -180,15 +89,7 @@ "type": "VHS_VideoCombine", "pos": { "0": 1441, - "1": 129, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "1": 129 }, "size": [ 778.7022705078125, @@ -259,15 +160,7 @@ "type": "CLIPLoader", "pos": { "0": -26, - "1": 400, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "1": 400 }, "size": { "0": 451.30548095703125, @@ -302,24 +195,32 @@ "type": "DownloadAndLoadCogVideoModel", "pos": { "0": 642, - "1": 90, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "1": 90 }, "size": { "0": 315, - "1": 154 + "1": 194 }, "flags": {}, "order": 1, "mode": 0, - "inputs": [], + "inputs": [ + { + "name": "pab_config", + "type": "PAB_CONFIG", + "link": null + }, + { + "name": "block_edit", + "type": "TRANSFORMERBLOCKS", + "link": null + }, + { + "name": "lora", + "type": "COGLORA", + "link": null + } + ], "outputs": [ { "name": "cogvideo_pipe", @@ -347,20 +248,12 @@ "type": "CogVideoDecode", "pos": { "0": 1051, - "1": 748, - "2": 0, - "3": 0, - "4": 0, - "5": 0, - "6": 0, - "7": 0, - "8": 0, - "9": 0 + "1": 748 + }, + "size": { + "0": 300.396484375, + "1": 198 }, - "size": [ - 300.3964783563508, - 198 - ], "flags": {}, "order": 5, "mode": 0, @@ -398,6 +291,85 @@ 0.2, true ] + }, + { + "id": 34, + "type": "CogVideoSampler", + "pos": { + "0": 1041, + "1": 342 + }, + "size": { + "0": 315.8404846191406, + "1": 358 + }, + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 60 + }, + { + "name": "positive", + "type": "CONDITIONING", + "link": 61 + }, + { + "name": "negative", + "type": "CONDITIONING", + "link": 62 + }, + { + "name": "samples", + "type": "LATENT", + "link": null + }, + { + "name": "image_cond_latents", + "type": "LATENT", + "link": null + }, + { + "name": "context_options", + "type": "COGCONTEXT", + "link": null + } + ], + "outputs": [ + { + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", + "links": [ + 63 + ], + "shape": 3 + }, + { + "name": "samples", + "type": "LATENT", + "links": [ + 64 + ], + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoSampler" + }, + "widgets_values": [ + 480, + 720, + 49, + 50, + 6, + 806286757407563, + "fixed", + "DPM", + 1 + ] } ], "links": [ @@ -470,10 +442,10 @@ "config": {}, "extra": { "ds": { - "scale": 0.8264462809917354, + "scale": 0.6934334949442514, "offset": [ - 161.910286780368, - 124.7586178095323 + -24.154349208343916, + 155.20539218330134 ] } }, diff --git a/examples/cogvideox_I2V_example_01.json b/examples/cogvideox_I2V_example_01.json index ee18e70..b6cefa3 100644 --- a/examples/cogvideox_I2V_example_01.json +++ b/examples/cogvideox_I2V_example_01.json @@ -46,7 +46,7 @@ }, "size": { "0": 463.01251220703125, - "1": 98.10446166992188 + "1": 124 }, "flags": {}, "order": 4, @@ -73,7 +73,9 @@ "Node name for S&R": "CogVideoTextEncode" }, "widgets_values": [ - "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. " + "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. ", + 1, + true ] }, { @@ -112,122 +114,9 @@ "Node name for S&R": "CogVideoTextEncode" }, "widgets_values": [ - "a majestic stag is grazing in an enhanced forest, basking in the setting sun filtered by the trees" - ] - }, - { - "id": 1, - "type": "DownloadAndLoadCogVideoModel", - "pos": { - "0": 642, - "1": 90 - }, - "size": { - "0": 337.8885192871094, - "1": 154 - }, - "flags": {}, - "order": 1, - "mode": 0, - "inputs": [], - "outputs": [ - { - "name": "cogvideo_pipe", - "type": "COGVIDEOPIPE", - "links": [ - 121, - 124 - ], - "slot_index": 0, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "DownloadAndLoadCogVideoModel" - }, - "widgets_values": [ - "THUDM/CogVideoX-5b-I2V", - "bf16", - "disabled", - "disabled", - false - ] - }, - { - "id": 57, - "type": "CogVideoSampler", - "pos": { - "0": 1138, - "1": 150 - }, - "size": { - "0": 405.5999755859375, - "1": 378 - }, - "flags": {}, - "order": 7, - "mode": 0, - "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 121 - }, - { - "name": "positive", - "type": "CONDITIONING", - "link": 122 - }, - { - "name": "negative", - "type": "CONDITIONING", - "link": 123 - }, - { - "name": "samples", - "type": "LATENT", - "link": null - }, - { - "name": "image_cond_latents", - "type": "LATENT", - "link": 129 - } - ], - "outputs": [ - { - "name": "cogvideo_pipe", - "type": "COGVIDEOPIPE", - "links": [ - 128 - ], - "slot_index": 0, - "shape": 3 - }, - { - "name": "samples", - "type": "LATENT", - "links": [ - 127 - ], - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "CogVideoSampler" - }, - "widgets_values": [ - 480, - 720, - 49, - 50, - 6, - 65334758276105, - "fixed", - "DPM", - 16, - 8, - 1 + "a majestic stag is grazing in an enhanced forest, basking in the setting sun filtered by the trees", + 1, + true ] }, { @@ -486,12 +375,12 @@ "0": 365, "1": 685 }, - "size": [ - 402.0635467506413, - 396.62260382077534 - ], + "size": { + "0": 402.06353759765625, + "1": 396.6225891113281 + }, "flags": {}, - "order": 2, + "order": 1, "mode": 0, "inputs": [], "outputs": [ @@ -518,6 +407,140 @@ "sd3stag.png", "image" ] + }, + { + "id": 57, + "type": "CogVideoSampler", + "pos": { + "0": 1138, + "1": 150 + }, + "size": [ + 399.878095897654, + 350 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 121 + }, + { + "name": "positive", + "type": "CONDITIONING", + "link": 122 + }, + { + "name": "negative", + "type": "CONDITIONING", + "link": 123 + }, + { + "name": "samples", + "type": "LATENT", + "link": null + }, + { + "name": "image_cond_latents", + "type": "LATENT", + "link": 129 + }, + { + "name": "context_options", + "type": "COGCONTEXT", + "link": null + } + ], + "outputs": [ + { + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", + "links": [ + 128 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "samples", + "type": "LATENT", + "links": [ + 127 + ], + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoSampler" + }, + "widgets_values": [ + 480, + 720, + 49, + 20, + 6, + 65334758276105, + "fixed", + "CogVideoXDPMScheduler", + 1 + ] + }, + { + "id": 1, + "type": "DownloadAndLoadCogVideoModel", + "pos": { + "0": 633, + "1": 44 + }, + "size": { + "0": 337.8885192871094, + "1": 194 + }, + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "name": "pab_config", + "type": "PAB_CONFIG", + "link": null + }, + { + "name": "block_edit", + "type": "TRANSFORMERBLOCKS", + "link": null + }, + { + "name": "lora", + "type": "COGLORA", + "link": null + } + ], + "outputs": [ + { + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", + "links": [ + 121, + 124 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "DownloadAndLoadCogVideoModel" + }, + "widgets_values": [ + "THUDM/CogVideoX-5b-I2V", + "bf16", + "disabled", + "disabled", + false + ] } ], "links": [ @@ -622,10 +645,10 @@ "config": {}, "extra": { "ds": { - "scale": 0.7513148009015778, + "scale": 0.6934334949442514, "offset": [ - 153.9884532493587, - 111.86149660036742 + -24.154349208343916, + 155.20539218330134 ] } }, diff --git a/nodes.py b/nodes.py index f232dba..ac08b3f 100644 --- a/nodes.py +++ b/nodes.py @@ -36,7 +36,6 @@ scheduler_mapping = { "Euler A": EulerAncestralDiscreteScheduler, "PNDM": PNDMScheduler, "DDIM": DDIMScheduler, - "DDIM_tiled": CogVideoXDDIMScheduler, "CogVideoXDDIM": CogVideoXDDIMScheduler, "CogVideoXDPMScheduler": CogVideoXDPMScheduler, "SASolverScheduler": SASolverScheduler, @@ -1292,7 +1291,7 @@ class CogVideoContextOptions: def INPUT_TYPES(s): return {"required": { "context_schedule": (["uniform_standard", "uniform_looped", "static_standard", "temporal_tiling"],), - "context_frames": ("INT", {"default": 12, "min": 2, "max": 100, "step": 1, "tooltip": "Number of pixel frames in the context, NOTE: the latent space has 4 frames in 1"} ), + "context_frames": ("INT", {"default": 48, "min": 2, "max": 100, "step": 1, "tooltip": "Number of pixel frames in the context, NOTE: the latent space has 4 frames in 1"} ), "context_stride": ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context stride as pixel frames, NOTE: the latent space has 4 frames in 1"} ), "context_overlap": ("INT", {"default": 4, "min": 4, "max": 100, "step": 1, "tooltip": "Context overlap as pixel frames, NOTE: the latent space has 4 frames in 1"} ), "freenoise": ("BOOLEAN", {"default": True, "tooltip": "Shuffle the noise"}),