From 97e89d596e00bbdc97c7860270b5ad719eae2b28 Mon Sep 17 00:00:00 2001 From: kijai <40791699+kijai@users.noreply.github.com> Date: Wed, 7 Aug 2024 01:10:19 +0300 Subject: [PATCH] update examples, expose scheduler, force T5 offload --- .../cogvideo_vid2vid_test_example_01.json | 675 +++++++++--------- examples/example_01.json | 117 +-- nodes.py | 28 +- pipeline_cogvideox.py | 26 - 4 files changed, 436 insertions(+), 410 deletions(-) diff --git a/examples/cogvideo_vid2vid_test_example_01.json b/examples/cogvideo_vid2vid_test_example_01.json index 19efdd5..93b1532 100644 --- a/examples/cogvideo_vid2vid_test_example_01.json +++ b/examples/cogvideo_vid2vid_test_example_01.json @@ -1,46 +1,7 @@ { - "last_node_id": 59, - "last_link_id": 137, + "last_node_id": 64, + "last_link_id": 167, "nodes": [ - { - "id": 31, - "type": "CogVideoTextEncode", - "pos": [ - 503, - 521 - ], - "size": { - "0": 463.01251220703125, - "1": 98.10446166992188 - }, - "flags": {}, - "order": 4, - "mode": 0, - "inputs": [ - { - "name": "clip", - "type": "CLIP", - "link": 56 - } - ], - "outputs": [ - { - "name": "conditioning", - "type": "CONDITIONING", - "links": [ - 80 - ], - "shape": 3, - "slot_index": 0 - } - ], - "properties": { - "Node name for S&R": "CogVideoTextEncode" - }, - "widgets_values": [ - "" - ] - }, { "id": 1, "type": "DownloadAndLoadCogVideoModel", @@ -60,8 +21,8 @@ "name": "cogvideo_pipe", "type": "COGVIDEOPIPE", "links": [ - 78, - 83 + 83, + 159 ], "shape": 3, "slot_index": 0 @@ -108,47 +69,6 @@ "sd3" ] }, - { - "id": 11, - "type": "CogVideoDecode", - "pos": [ - 1199, - 661 - ], - "size": { - "0": 210, - "1": 46 - }, - "flags": {}, - "order": 9, - "mode": 0, - "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 81 - }, - { - "name": "samples", - "type": "LATENT", - "link": 82 - } - ], - "outputs": [ - { - "name": "images", - "type": "IMAGE", - "links": [ - 118 - ], - "shape": 3, - "slot_index": 0 - } - ], - "properties": { - "Node name for S&R": "CogVideoDecode" - } - }, { "id": 56, "type": "SimpleMath+", @@ -233,7 +153,7 @@ "name": "samples", "type": "LATENT", "links": [ - 122 + 162 ], "shape": 3, "slot_index": 0 @@ -301,84 +221,6 @@ "Node name for S&R": "GetImageSizeAndCount" } }, - { - "id": 41, - "type": "ImageResizeKJ", - "pos": [ - 315, - -19 - ], - "size": { - "0": 315, - "1": 242 - }, - "flags": {}, - "order": 5, - "mode": 0, - "inputs": [ - { - "name": "image", - "type": "IMAGE", - "link": 128 - }, - { - "name": "get_image_size", - "type": "IMAGE", - "link": null - }, - { - "name": "width_input", - "type": "INT", - "link": null, - "widget": { - "name": "width_input" - } - }, - { - "name": "height_input", - "type": "INT", - "link": null, - "widget": { - "name": "height_input" - } - } - ], - "outputs": [ - { - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 126 - ], - "shape": 3, - "slot_index": 0 - }, - { - "name": "width", - "type": "INT", - "links": null, - "shape": 3 - }, - { - "name": "height", - "type": "INT", - "links": null, - "shape": 3 - } - ], - "properties": { - "Node name for S&R": "ImageResizeKJ" - }, - "widgets_values": [ - 720, - 480, - "lanczos", - false, - 2, - 0, - 0 - ] - }, { "id": 59, "type": "GetImageRangeFromBatch", @@ -448,10 +290,10 @@ 1451, 368 ], - "size": [ - 315, - 102 - ], + "size": { + "0": 315, + "1": 102 + }, "flags": { "collapsed": true }, @@ -552,12 +394,12 @@ "id": 47, "type": "VHS_VideoCombine", "pos": [ - 1789, + 1790, -104 ], "size": [ - 1113.3311767578125, - 712.4437255859375 + 1110, + 711.3333333333333 ], "flags": {}, "order": 15, 
@@ -610,7 +452,7 @@ "hidden": false, "paused": false, "params": { - "filename": "AnimateDiff_00011.mp4", + "filename": "AnimateDiff_00008.mp4", "subfolder": "", "type": "temp", "format": "video/nvenc_h264-mp4", @@ -619,6 +461,190 @@ } } }, + { + "id": 57, + "type": "GetImageSizeAndCount", + "pos": [ + 674, + 2 + ], + "size": { + "0": 210, + "1": 86 + }, + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 126, + "slot_index": 0 + } + ], + "outputs": [ + { + "name": "image", + "type": "IMAGE", + "links": [ + 129, + 136 + ], + "shape": 3, + "slot_index": 0 + }, + { + "name": "720 width", + "type": "INT", + "links": [ + 165 + ], + "shape": 3, + "slot_index": 1 + }, + { + "name": "480 height", + "type": "INT", + "links": [ + 164 + ], + "shape": 3, + "slot_index": 2 + }, + { + "name": "16 count", + "type": "INT", + "links": [ + 163 + ], + "shape": 3, + "slot_index": 3 + } + ], + "properties": { + "Node name for S&R": "GetImageSizeAndCount" + } + }, + { + "id": 41, + "type": "ImageResizeKJ", + "pos": [ + 315, + -19 + ], + "size": { + "0": 315, + "1": 242 + }, + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 128 + }, + { + "name": "get_image_size", + "type": "IMAGE", + "link": null + }, + { + "name": "width_input", + "type": "INT", + "link": null, + "widget": { + "name": "width_input" + } + }, + { + "name": "height_input", + "type": "INT", + "link": null, + "widget": { + "name": "height_input" + } + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 126 + ], + "shape": 3, + "slot_index": 0 + }, + { + "name": "width", + "type": "INT", + "links": null, + "shape": 3 + }, + { + "name": "height", + "type": "INT", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "ImageResizeKJ" + }, + "widgets_values": [ + 720, + 480, + "lanczos", + false, + 2, + 0, + 0 + ] + }, + { + "id": 11, + "type": "CogVideoDecode", + "pos": [ + 1201, + 684 + ], + "size": { + "0": 210, + "1": 46 + }, + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 166 + }, + { + "name": "samples", + "type": "LATENT", + "link": 167 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 118 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CogVideoDecode" + } + }, { "id": 30, "type": "CogVideoTextEncode", @@ -645,7 +671,7 @@ "name": "conditioning", "type": "CONDITIONING", "links": [ - 79 + 160 ], "shape": 3, "slot_index": 0 @@ -655,84 +681,46 @@ "Node name for S&R": "CogVideoTextEncode" }, "widgets_values": [ - "video of dinosaur turning it's head in a cinematic and dramatic scene from a movie" + "cinematic video of a red panda turning it's head" ] }, { - "id": 36, - "type": "CogVideoSampler", + "id": 31, + "type": "CogVideoTextEncode", "pos": [ - 1093, - 292 - ], - "size": [ - 315, - 310 + 503, + 521 ], + "size": { + "0": 463.01251220703125, + "1": 98.10446166992188 + }, "flags": {}, - "order": 8, + "order": 4, "mode": 0, "inputs": [ { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 78 - }, - { - "name": "positive", - "type": "CONDITIONING", - "link": 79 - }, - { - "name": "negative", - "type": "CONDITIONING", - "link": 80 - }, - { - "name": "samples", - "type": "LATENT", - "link": 122 - }, - { - "name": "num_frames", - "type": "INT", - "link": 137, - "widget": { - "name": "num_frames" - } + "name": "clip", + "type": "CLIP", + "link": 
56 } ], "outputs": [ { - "name": "cogvideo_pipe", - "type": "COGVIDEOPIPE", + "name": "conditioning", + "type": "CONDITIONING", "links": [ - 81 + 161 ], - "shape": 3 - }, - { - "name": "samples", - "type": "LATENT", - "links": [ - 82 - ], - "shape": 3 + "shape": 3, + "slot_index": 0 } ], "properties": { - "Node name for S&R": "CogVideoSampler" + "Node name for S&R": "CogVideoTextEncode" }, "widgets_values": [ - 480, - 720, - 16, - 8, - 25, - 8, - 1119546789766856, - "fixed", - 0.8 + "bad quality video, blurry, messy" ] }, { @@ -819,63 +807,98 @@ } }, { - "id": 57, - "type": "GetImageSizeAndCount", + "id": 64, + "type": "CogVideoSampler", "pos": [ - 674, - 2 + 1090, + 290 ], "size": { - "0": 210, - "1": 86 + "0": 315, + "1": 342 }, "flags": {}, - "order": 6, + "order": 8, "mode": 0, "inputs": [ { - "name": "image", - "type": "IMAGE", - "link": 126, - "slot_index": 0 + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 159 + }, + { + "name": "positive", + "type": "CONDITIONING", + "link": 160 + }, + { + "name": "negative", + "type": "CONDITIONING", + "link": 161 + }, + { + "name": "samples", + "type": "LATENT", + "link": 162 + }, + { + "name": "num_frames", + "type": "INT", + "link": 163, + "widget": { + "name": "num_frames" + } + }, + { + "name": "height", + "type": "INT", + "link": 164, + "widget": { + "name": "height" + } + }, + { + "name": "width", + "type": "INT", + "link": 165, + "widget": { + "name": "width" + } } ], "outputs": [ { - "name": "image", - "type": "IMAGE", + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", "links": [ - 129, - 136 + 166 ], - "shape": 3, - "slot_index": 0 - }, - { - "name": "720 width", - "type": "INT", - "links": null, "shape": 3 }, { - "name": "480 height", - "type": "INT", - "links": null, - "shape": 3 - }, - { - "name": "16 count", - "type": "INT", + "name": "samples", + "type": "LATENT", "links": [ - 137 + 167 ], - "shape": 3, - "slot_index": 3 + "shape": 3 } ], "properties": { - "Node name for S&R": "GetImageSizeAndCount" - } + "Node name for S&R": "CogVideoSampler" + }, + "widgets_values": [ + 480, + 720, + 48, + 8, + 35, + 9, + 6, + "fixed", + "DPM", + 0.7000000000000001 + ] } ], "links": [ @@ -895,46 +918,6 @@ 0, "CLIP" ], - [ - 78, - 1, - 0, - 36, - 0, - "COGVIDEOPIPE" - ], - [ - 79, - 30, - 0, - 36, - 1, - "CONDITIONING" - ], - [ - 80, - 31, - 0, - 36, - 2, - "CONDITIONING" - ], - [ - 81, - 36, - 0, - 11, - 0, - "COGVIDEOPIPE" - ], - [ - 82, - 36, - 1, - 11, - 1, - "LATENT" - ], [ 83, 1, @@ -975,14 +958,6 @@ 0, "INT,FLOAT" ], - [ - 122, - 37, - 0, - 36, - 3, - "LATENT" - ], [ 126, 41, @@ -1048,22 +1023,86 @@ "IMAGE" ], [ - 137, + 159, + 1, + 0, + 64, + 0, + "COGVIDEOPIPE" + ], + [ + 160, + 30, + 0, + 64, + 1, + "CONDITIONING" + ], + [ + 161, + 31, + 0, + 64, + 2, + "CONDITIONING" + ], + [ + 162, + 37, + 0, + 64, + 3, + "LATENT" + ], + [ + 163, 57, 3, - 36, + 64, 4, "INT" + ], + [ + 164, + 57, + 2, + 64, + 5, + "INT" + ], + [ + 165, + 57, + 1, + 64, + 6, + "INT" + ], + [ + 166, + 64, + 0, + 11, + 0, + "COGVIDEOPIPE" + ], + [ + 167, + 64, + 1, + 11, + 1, + "LATENT" ] ], "groups": [], "config": {}, "extra": { "ds": { - "scale": 0.7513148009015777, + "scale": 0.6830134553650705, "offset": [ - 45.633655208726886, - 389.8041242612087 + 56.628416841109384, + 394.7727729054069 ] } }, diff --git a/examples/example_01.json b/examples/example_01.json index f707db7..1881508 100644 --- a/examples/example_01.json +++ b/examples/example_01.json @@ -11,7 +11,7 @@ ], "size": { "0": 315, - "1": 266 + "1": 334 }, "flags": {}, "order": 4, @@ -32,6 +32,11 @@ 
"name": "negative", "type": "CONDITIONING", "link": 57 + }, + { + "name": "samples", + "type": "LATENT", + "link": null } ], "outputs": [ @@ -63,50 +68,11 @@ 25, 6, 806286757407561, - "fixed" + "fixed", + "DDIM", + 1 ] }, - { - "id": 11, - "type": "CogVideoDecode", - "pos": [ - 1142, - 658 - ], - "size": { - "0": 210, - "1": 46 - }, - "flags": {}, - "order": 5, - "mode": 0, - "inputs": [ - { - "name": "pipeline", - "type": "COGVIDEOPIPE", - "link": 37 - }, - { - "name": "samples", - "type": "LATENT", - "link": 38 - } - ], - "outputs": [ - { - "name": "images", - "type": "IMAGE", - "links": [ - 51 - ], - "shape": 3, - "slot_index": 0 - } - ], - "properties": { - "Node name for S&R": "CogVideoDecode" - } - }, { "id": 28, "type": "VHS_VideoCombine", @@ -169,7 +135,7 @@ "hidden": false, "paused": false, "params": { - "filename": "AnimateDiff_00001.mp4", + "filename": "CogVideoX_00001.mp4", "subfolder": "", "type": "temp", "format": "video/h264-mp4", @@ -185,10 +151,10 @@ 500, 308 ], - "size": [ - 474.84501511852204, - 164.74235966960538 - ], + "size": { + "0": 474.8450012207031, + "1": 164.7423553466797 + }, "flags": {}, "order": 2, "mode": 0, @@ -258,10 +224,10 @@ 503, 521 ], - "size": [ - 463.01251866466464, - 98.10446321574796 - ], + "size": { + "0": 463.01251220703125, + "1": 98.10446166992188 + }, "flags": {}, "order": 3, "mode": 0, @@ -321,6 +287,47 @@ "widgets_values": [ "fp16" ] + }, + { + "id": 11, + "type": "CogVideoDecode", + "pos": [ + 1138, + 725 + ], + "size": { + "0": 210, + "1": 46 + }, + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 37 + }, + { + "name": "samples", + "type": "LATENT", + "link": 38 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 51 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CogVideoDecode" + } } ], "links": [ @@ -393,10 +400,10 @@ "config": {}, "extra": { "ds": { - "scale": 0.6830134553650706, + "scale": 0.9090909090909092, "offset": [ - 359.4381777891929, - 334.95283678425216 + 12.99028921497383, + 38.21608107136124 ] } }, diff --git a/nodes.py b/nodes.py index 584fc17..f562580 100644 --- a/nodes.py +++ b/nodes.py @@ -2,7 +2,7 @@ import os import torch import folder_paths import comfy.model_management as mm - +from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler from .pipeline_cogvideox import CogVideoXPipeline import logging @@ -54,11 +54,11 @@ class DownloadAndLoadCogVideoModel: ) pipe = CogVideoXPipeline.from_pretrained(base_path, torch_dtype=dtype).to(offload_device) - pipeline = { "pipe": pipe, - "dtype": dtype + "dtype": dtype, + "base_path": base_path } return (pipeline,) @@ -115,11 +115,15 @@ class CogVideoTextEncode: CATEGORY = "CogVideoWrapper" def process(self, clip, prompt): + load_device = mm.text_encoder_device() + offload_device = mm.text_encoder_offload_device() clip.tokenizer.t5xxl.pad_to_max_length = True clip.tokenizer.t5xxl.max_length = 226 + clip.cond_stage_model.to(load_device) tokens = clip.tokenize(prompt, return_word_ids=True) embeds = clip.encode_from_tokens(tokens, return_pooled=False, return_dict=False) + clip.cond_stage_model.to(offload_device) return (embeds, ) @@ -194,6 +198,7 @@ class CogVideoSampler: "steps": ("INT", {"default": 25, "min": 1}), "cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}), "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}), + "scheduler": (["DDIM", "DPM"],), }, "optional": { "samples": 
("LATENT", ), @@ -206,16 +211,22 @@ class CogVideoSampler: FUNCTION = "process" CATEGORY = "CogVideoWrapper" - def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, samples=None, denoise_strength=1.0): + def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, samples=None, denoise_strength=1.0): mm.soft_empty_cache() device = mm.get_torch_device() offload_device = mm.unet_offload_device() pipe = pipeline["pipe"] dtype = pipeline["dtype"] + base_path = pipeline["base_path"] pipe.transformer.to(device) generator = torch.Generator(device=device).manual_seed(seed) + if scheduler == "DDIM": + pipe.scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder="scheduler") + elif scheduler == "DPM": + pipe.scheduler = CogVideoXDPMScheduler.from_pretrained(base_path, subfolder="scheduler") + latents = pipeline["pipe"]( num_inference_steps=steps, height = height, @@ -227,7 +238,6 @@ class CogVideoSampler: denoise_strength=denoise_strength, prompt_embeds=positive.to(dtype).to(device), negative_prompt_embeds=negative.to(dtype).to(device), - #negative_prompt_embeds=torch.zeros_like(embeds), generator=generator, output_type="latents", device=device @@ -264,11 +274,10 @@ class CogVideoDecode: if "num_frames" in pipeline: num_frames = pipeline["num_frames"] fps = pipeline["fps"] - - else: num_frames = latents.shape[2] fps = 8 + num_seconds = num_frames // fps latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width] latents = 1 / vae.config.scaling_factor * latents @@ -278,17 +287,14 @@ class CogVideoDecode: # Whether or not to clear fake context parallel cache fake_cp = i + 1 < num_seconds start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3) - current_frames = vae.decode(latents[:, :, start_frame:end_frame], fake_cp=fake_cp).sample frames.append(current_frames) + mm.soft_empty_cache() vae.to(offload_device) frames = torch.cat(frames, dim=2) - print(frames.min(), frames.max()) video = pipeline["pipe"].video_processor.postprocess_video(video=frames, output_type="pt") - print(video.shape) video = video[0].permute(0, 2, 3, 1).cpu().float() - print(video.min(), video.max()) return (video,) diff --git a/pipeline_cogvideox.py b/pipeline_cogvideox.py index 4383322..7faa0d3 100644 --- a/pipeline_cogvideox.py +++ b/pipeline_cogvideox.py @@ -222,22 +222,6 @@ class CogVideoXPipeline(DiffusionPipeline): latents = latents * self.scheduler.init_noise_sigma return latents, timesteps - def decode_latents(self, latents: torch.Tensor, num_seconds: int): - latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width] - latents = 1 / self.vae.config.scaling_factor * latents - - frames = [] - for i in range(num_seconds): - # Whether or not to clear fake context parallel cache - fake_cp = i + 1 < num_seconds - start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3) - - current_frames = self.vae.decode(latents[:, :, start_frame:end_frame], fake_cp=fake_cp).sample - frames.append(current_frames) - - frames = torch.cat(frames, dim=2) - return frames - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature @@ -534,17 +518,7 @@ class CogVideoXPipeline(DiffusionPipeline): progress_bar.update() 
comfy_pbar.update(1) - if not output_type == "latents": - video = self.decode_latents(latents, num_frames // fps) - video = self.video_processor.postprocess_video(video=video, output_type=output_type) - else: - video = latents - print(video.shape) - # Offload all models self.maybe_free_model_hooks() - if not return_dict: - return (video,) return latents - #return CogVideoXPipelineOutput(frames=video)
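
Note: a minimal standalone sketch of the headline change above — the scheduler choice now exposed on the sampler node, which rebuilds the pipeline's scheduler from the model's `scheduler` subfolder using the diffusers classes imported in nodes.py. The snippet mirrors that logic outside ComfyUI; `model_dir` and the chosen `scheduler` value are illustrative assumptions, not part of the patch.

    import torch
    from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
    from pipeline_cogvideox import CogVideoXPipeline

    model_dir = "CogVideoX-2b"  # assumed path to a local CogVideoX checkpoint
    pipe = CogVideoXPipeline.from_pretrained(model_dir, torch_dtype=torch.float16)

    scheduler = "DPM"  # the new sampler widget offers "DDIM" or "DPM"
    if scheduler == "DDIM":
        pipe.scheduler = CogVideoXDDIMScheduler.from_pretrained(model_dir, subfolder="scheduler")
    elif scheduler == "DPM":
        pipe.scheduler = CogVideoXDPMScheduler.from_pretrained(model_dir, subfolder="scheduler")

    # After this patch the pipeline call returns raw latents (output_type="latents");
    # VAE decoding happens separately in the CogVideoDecode node.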