diff --git a/examples/example_01.json b/examples/example_01.json index 49da49d..f4ee81f 100644 --- a/examples/example_01.json +++ b/examples/example_01.json @@ -1,145 +1,37 @@ { - "last_node_id": 12, - "last_link_id": 23, + "last_node_id": 31, + "last_link_id": 57, "nodes": [ { - "id": 11, - "type": "CogVideoDecode", - "pos": [ - 1301, - 352 - ], - "size": { - "0": 210, - "1": 46 - }, - "flags": {}, - "order": 3, - "mode": 0, - "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 21 - }, - { - "name": "samples", - "type": "LATENT", - "link": 22 - } - ], - "outputs": [ - { - "name": "images", - "type": "IMAGE", - "links": [ - 23 - ], - "shape": 3, - "slot_index": 0 - } - ], - "properties": { - "Node name for S&R": "CogVideoDecode" - } - }, - { - "id": 2, - "type": "CogVideoEncodePrompt", - "pos": [ - 459, - 485 - ], - "size": [ - 408.03107827615304, - 315.59645204258936 - ], - "flags": {}, - "order": 1, - "mode": 0, - "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 1 - } - ], - "outputs": [ - { - "name": "embeds", - "type": "COGEMBEDS", - "links": [ - 16 - ], - "shape": 3, - "slot_index": 0 - } - ], - "properties": { - "Node name for S&R": "CogVideoEncodePrompt" - }, - "widgets_values": [ - "A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.", - "" - ] - }, - { - "id": 1, - "type": "DownloadAndLoadCogVideoModel", - "pos": [ - 460, - 354 - ], - "size": { - "0": 315, - "1": 58 - }, - "flags": {}, - "order": 0, - "mode": 0, - "outputs": [ - { - "name": "cogvideo_pipe", - "type": "COGVIDEOPIPE", - "links": [ - 1, - 15 - ], - "shape": 3, - "slot_index": 0 - } - ], - "properties": { - "Node name for S&R": "DownloadAndLoadCogVideoModel" - }, - "widgets_values": [ - "fp16" - ] - }, - { - "id": 10, + "id": 22, "type": "CogVideoSampler", "pos": [ - 920, - 353 + 1041, + 342 ], "size": { "0": 315, - "1": 246 + "1": 266 }, "flags": {}, - "order": 2, + "order": 4, "mode": 0, "inputs": [ { "name": "pipeline", "type": "COGVIDEOPIPE", - "link": 15 + "link": 36 }, { - "name": "embeds", - "type": "COGEMBEDS", - "link": 16 + "name": "positive", + "type": "CONDITIONING", + "link": 55, + "slot_index": 1 + }, + { + "name": "negative", + "type": "CONDITIONING", + "link": 57 } ], "outputs": [ @@ -147,7 +39,7 @@ "name": "cogvideo_pipe", "type": "COGVIDEOPIPE", "links": [ - 21 + 37 ], "shape": 3 }, @@ -155,7 +47,7 @@ "name": "samples", "type": "LATENT", "links": [ - 22 + 38 ], "shape": 3 } @@ -166,33 +58,75 @@ "widgets_values": [ 480, 720, - 48, + 16, 8, - 30, + 25, 6, - 867121661458558, + 806286757407561, "fixed" ] }, { - "id": 12, + "id": 11, + "type": "CogVideoDecode", + "pos": [ + 1142, + 658 + ], + "size": { + "0": 210, + "1": 46 + }, + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 37 + }, + { + "name": "samples", + "type": "LATENT", + "link": 38 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 51 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CogVideoDecode" + } + }, + { + "id": 28, "type": "VHS_VideoCombine", "pos": [ - 1563, - 353 + 1432, + 150 ], "size": [ - 315, - 520.6666666666666 + 667.752197265625, + 755.8347981770833 ], "flags": {}, - "order": 4, + "order": 6, "mode": 0, "inputs": [ { "name": "images", "type": "IMAGE", - "link": 23 + "link": 51, + "slot_index": 0 }, { "name": "audio", @@ -235,7 +169,7 @@ "hidden": false, "paused": false, "params": { - "filename": "AnimateDiff_00003.mp4", + "filename": "AnimateDiff_00001.mp4", "subfolder": "", "type": "temp", "format": "video/h264-mp4", @@ -243,66 +177,226 @@ } } } + }, + { + "id": 30, + "type": "CogVideoTextEncode", + "pos": [ + 500, + 308 + ], + "size": [ + 474.84501511852204, + 164.74235966960538 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 54 + } + ], + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [ + 55 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CogVideoTextEncode" + }, + "widgets_values": [ + "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature\nacoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters\nthrough the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The\nbackground includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical\nperformance." + ] + }, + { + "id": 20, + "type": "CLIPLoader", + "pos": [ + -59, + 397 + ], + "size": { + "0": 451.30548095703125, + "1": 82 + }, + "flags": {}, + "order": 0, + "mode": 0, + "outputs": [ + { + "name": "CLIP", + "type": "CLIP", + "links": [ + 54, + 56 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CLIPLoader" + }, + "widgets_values": [ + "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", + "sd3" + ] + }, + { + "id": 31, + "type": "CogVideoTextEncode", + "pos": [ + 503, + 521 + ], + "size": [ + 463.01251866466464, + 98.10446321574796 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 56 + } + ], + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [ + 57 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CogVideoTextEncode" + }, + "widgets_values": [ + "" + ] + }, + { + "id": 1, + "type": "DownloadAndLoadCogVideoModel", + "pos": [ + 649, + 182 + ], + "size": { + "0": 315, + "1": 58 + }, + "flags": {}, + "order": 1, + "mode": 0, + "outputs": [ + { + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", + "links": [ + 36 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DownloadAndLoadCogVideoModel" + }, + "widgets_values": [ + "fp16" + ] } ], "links": [ [ - 1, + 36, 1, 0, - 2, + 22, 0, "COGVIDEOPIPE" ], [ - 15, - 1, - 0, - 10, - 0, - "COGVIDEOPIPE" - ], - [ - 16, - 2, - 0, - 10, - 1, - "COGEMBEDS" - ], - [ - 21, - 10, + 37, + 22, 0, 11, 0, "COGVIDEOPIPE" ], [ + 38, 22, - 10, 1, 11, 1, "LATENT" ], [ - 23, + 51, 11, 0, - 12, + 28, 0, "IMAGE" + ], + [ + 54, + 20, + 0, + 30, + 0, + "CLIP" + ], + [ + 55, + 30, + 0, + 22, + 1, + "CONDITIONING" + ], + [ + 56, + 20, + 0, + 31, + 0, + "CLIP" + ], + [ + 57, + 31, + 0, + 22, + 2, + "CONDITIONING" ] ], "groups": [], "config": {}, "extra": { "ds": { - "scale": 1, + "scale": 0.6830134553650706, "offset": [ - -281.3644522995906, - -67.92982606602688 + 359.4381777891929, + 334.95283678425216 ] } }, diff --git a/nodes.py b/nodes.py index 9fefeb5..e76663f 100644 --- a/nodes.py +++ b/nodes.py @@ -48,12 +48,13 @@ class DownloadAndLoadCogVideoModel: snapshot_download( repo_id="THUDM/CogVideoX-2b", - #ignore_patterns=["*sd-image-variations-encoder-fp16.safetensors", "fye_motion_module-fp16.safetensors"], + ignore_patterns=["*text_encoder*"], local_dir=base_path, local_dir_use_symlinks=False, ) pipe = CogVideoXPipeline.from_pretrained(base_path, torch_dtype=dtype).to(offload_device) + pipeline = { "pipe": pipe, @@ -72,8 +73,8 @@ class CogVideoEncodePrompt: } } - RETURN_TYPES = ("COGEMBEDS",) - RETURN_NAMES = ("embeds",) + RETURN_TYPES = ("CONDITIONING", "CONDITIONING") + RETURN_NAMES = ("positive", "negative") FUNCTION = "process" CATEGORY = "CogVideoWrapper" @@ -86,7 +87,7 @@ class CogVideoEncodePrompt: pipe.text_encoder.to(device) pipe.transformer.to(offload_device) - pos_embeds, neg_embeds = pipe.encode_prompt( + positive, negative = pipe.encode_prompt( prompt=prompt, negative_prompt=negative_prompt, do_classifier_free_guidance=True, @@ -96,11 +97,30 @@ class CogVideoEncodePrompt: dtype=dtype, ) pipe.text_encoder.to(offload_device) - embeds = { - "positive": pos_embeds, - "negative": neg_embeds, + + return (positive, negative) + +class CogVideoTextEncode: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "clip": ("CLIP",), + "prompt": ("STRING", {"default": "", "multiline": True} ), + } } + RETURN_TYPES = ("CONDITIONING",) + RETURN_NAMES = ("conditioning",) + FUNCTION = "process" + CATEGORY = "CogVideoWrapper" + + def process(self, clip, prompt): + clip.tokenizer.t5xxl.pad_to_max_length = True + clip.tokenizer.t5xxl.max_length = 226 + tokens = clip.tokenize(prompt, return_word_ids=True) + + embeds = clip.encode_from_tokens(tokens, return_pooled=False, return_dict=False) + return (embeds, ) class CogVideoSampler: @@ -108,7 +128,8 @@ class CogVideoSampler: def INPUT_TYPES(s): return {"required": { "pipeline": ("COGVIDEOPIPE",), - "embeds": ("COGEMBEDS", ), + "positive": ("CONDITIONING", ), + "negative": ("CONDITIONING", ), "height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}), "width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}), "num_frames": ("INT", {"default": 48, "min": 1, "max": 100, "step": 1}), @@ -124,11 +145,12 @@ class CogVideoSampler: FUNCTION = "process" CATEGORY = "CogVideoWrapper" - def process(self, pipeline, embeds, fps, steps, cfg, seed, height, width, num_frames): + def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames): mm.soft_empty_cache() device = mm.get_torch_device() offload_device = mm.unet_offload_device() pipe = pipeline["pipe"] + dtype = pipeline["dtype"] pipe.transformer.to(device) generator = torch.Generator(device=device).manual_seed(seed) @@ -140,8 +162,8 @@ class CogVideoSampler: num_frames = num_frames, fps = fps, guidance_scale=cfg, - prompt_embeds=embeds["positive"], - negative_prompt_embeds=embeds["negative"], + prompt_embeds=positive.to(dtype).to(device), + negative_prompt_embeds=negative.to(dtype).to(device), #negative_prompt_embeds=torch.zeros_like(embeds), generator=generator, output_type="latents", @@ -206,12 +228,12 @@ class CogVideoDecode: NODE_CLASS_MAPPINGS = { "DownloadAndLoadCogVideoModel": DownloadAndLoadCogVideoModel, "CogVideoSampler": CogVideoSampler, - "CogVideoEncodePrompt": CogVideoEncodePrompt, - "CogVideoDecode": CogVideoDecode + "CogVideoDecode": CogVideoDecode, + "CogVideoTextEncode": CogVideoTextEncode } NODE_DISPLAY_NAME_MAPPINGS = { - "DownloadAndLoadCogVideoModel": "DownloadAndLoadCogVideoModel", + "DownloadAndLoadCogVideoModel": "(Down)load CogVideo Model", "CogVideoSampler": "CogVideo Sampler", - "CogVideoEncodePrompt": "CogVideo EncodePrompt", "CogVideoDecode": "CogVideo Decode", + "CogVideoTextEncode": "CogVideo TextEncode" } \ No newline at end of file