diff --git a/examples/cogvideox_interpolation_example_01.json b/examples/cogvideox_interpolation_example_01.json new file mode 100644 index 0000000..8198543 --- /dev/null +++ b/examples/cogvideox_interpolation_example_01.json @@ -0,0 +1,831 @@ +{ + "last_node_id": 67, + "last_link_id": 152, + "nodes": [ + { + "id": 20, + "type": "CLIPLoader", + "pos": { + "0": -26, + "1": 400 + }, + "size": { + "0": 451.30548095703125, + "1": 82 + }, + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "CLIP", + "type": "CLIP", + "links": [ + 54, + 56 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CLIPLoader" + }, + "widgets_values": [ + "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", + "sd3" + ] + }, + { + "id": 31, + "type": "CogVideoTextEncode", + "pos": { + "0": 497, + "1": 520 + }, + "size": { + "0": 463.01251220703125, + "1": 124 + }, + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 56 + } + ], + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [ + 123 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoTextEncode" + }, + "widgets_values": [ + "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. 
", + 1, + true + ] + }, + { + "id": 30, + "type": "CogVideoTextEncode", + "pos": { + "0": 493, + "1": 303 + }, + "size": { + "0": 471.90142822265625, + "1": 168.08047485351562 + }, + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 54 + } + ], + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [ + 122 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoTextEncode" + }, + "widgets_values": [ + "a majestic stag is grazing in an enhanced forest, basking in the setting sun filtered by the trees", + 1, + true + ] + }, + { + "id": 57, + "type": "CogVideoSampler", + "pos": { + "0": 1138, + "1": 150 + }, + "size": { + "0": 399.8780822753906, + "1": 370 + }, + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 121 + }, + { + "name": "positive", + "type": "CONDITIONING", + "link": 122 + }, + { + "name": "negative", + "type": "CONDITIONING", + "link": 123 + }, + { + "name": "samples", + "type": "LATENT", + "link": null, + "shape": 7 + }, + { + "name": "image_cond_latents", + "type": "LATENT", + "link": 146, + "shape": 7 + }, + { + "name": "context_options", + "type": "COGCONTEXT", + "link": null, + "shape": 7 + }, + { + "name": "controlnet", + "type": "COGVIDECONTROLNET", + "link": null, + "shape": 7 + } + ], + "outputs": [ + { + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", + "links": [ + 128 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "samples", + "type": "LATENT", + "links": [ + 127 + ], + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoSampler" + }, + "widgets_values": [ + 480, + 720, + 49, + 20, + 6, + 65334758276105, + "fixed", + "CogVideoXDPMScheduler", + 1 + ] + }, + { + "id": 1, + "type": "DownloadAndLoadCogVideoModel", + "pos": { + "0": 633, + "1": 44 + }, + "size": { + "0": 337.8885192871094, + "1": 194 + }, + "flags": {}, + "order": 
1, + "mode": 0, + "inputs": [ + { + "name": "pab_config", + "type": "PAB_CONFIG", + "link": null, + "shape": 7 + }, + { + "name": "block_edit", + "type": "TRANSFORMERBLOCKS", + "link": null, + "shape": 7 + }, + { + "name": "lora", + "type": "COGLORA", + "link": null, + "shape": 7 + } + ], + "outputs": [ + { + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", + "links": [ + 121, + 149 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "DownloadAndLoadCogVideoModel" + }, + "widgets_values": [ + "feizhengcong/CogvideoX-Interpolation", + "bf16", + "disabled", + "disabled", + false + ] + }, + { + "id": 65, + "type": "CogVideoImageInterpolationEncode", + "pos": { + "0": 1123, + "1": 647 + }, + "size": [ + 331.6177535935244, + 118 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 149 + }, + { + "name": "start_image", + "type": "IMAGE", + "link": 147 + }, + { + "name": "end_image", + "type": "IMAGE", + "link": 152 + }, + { + "name": "mask", + "type": "MASK", + "link": null, + "shape": 7 + } + ], + "outputs": [ + { + "name": "samples", + "type": "LATENT", + "links": [ + 146 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CogVideoImageInterpolationEncode" + }, + "widgets_values": [ + false + ] + }, + { + "id": 44, + "type": "VHS_VideoCombine", + "pos": { + "0": 1927, + "1": 146 + }, + "size": [ + 605.3909912109375, + 714.2606608072917 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 118 + }, + { + "name": "audio", + "type": "AUDIO", + "link": null, + "shape": 7 + }, + { + "name": "meta_batch", + "type": "VHS_BatchManager", + "link": null, + "shape": 7 + }, + { + "name": "vae", + "type": "VAE", + "link": null, + "shape": 7 + } + ], + "outputs": [ + { + "name": "Filenames", + "type": "VHS_FILENAMES", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": 
"VHS_VideoCombine" + }, + "widgets_values": { + "frame_rate": 8, + "loop_count": 0, + "filename_prefix": "CogVideoX_interpolation", + "format": "video/h264-mp4", + "pix_fmt": "yuv420p", + "crf": 19, + "save_metadata": true, + "pingpong": false, + "save_output": false, + "videopreview": { + "hidden": false, + "paused": false, + "params": { + "filename": "CogVideoX-I2V_00001.mp4", + "subfolder": "", + "type": "temp", + "format": "video/h264-mp4", + "frame_rate": 8 + }, + "muted": false + } + } + }, + { + "id": 67, + "type": "ImageResizeKJ", + "pos": { + "0": 569, + "1": 1173 + }, + "size": [ + 315, + 266 + ], + "flags": { + "collapsed": true + }, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 151 + }, + { + "name": "get_image_size", + "type": "IMAGE", + "link": 150, + "shape": 7 + }, + { + "name": "width_input", + "type": "INT", + "link": null, + "widget": { + "name": "width_input" + }, + "shape": 7 + }, + { + "name": "height_input", + "type": "INT", + "link": null, + "widget": { + "name": "height_input" + }, + "shape": 7 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 152 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "width", + "type": "INT", + "links": [], + "slot_index": 1, + "shape": 3 + }, + { + "name": "height", + "type": "INT", + "links": [], + "slot_index": 2, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "ImageResizeKJ" + }, + "widgets_values": [ + 720, + 480, + "lanczos", + false, + 16, + 0, + 0, + "disabled" + ] + }, + { + "id": 37, + "type": "ImageResizeKJ", + "pos": { + "0": 537, + "1": 722 + }, + "size": { + "0": 315, + "1": 266 + }, + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 71 + }, + { + "name": "get_image_size", + "type": "IMAGE", + "link": null, + "shape": 7 + }, + { + "name": "width_input", + "type": "INT", + "link": null, + "widget": { + "name": "width_input" + } + }, + { + 
"name": "height_input", + "type": "INT", + "link": null, + "widget": { + "name": "height_input" + } + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 147, + 150 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "width", + "type": "INT", + "links": [], + "slot_index": 1, + "shape": 3 + }, + { + "name": "height", + "type": "INT", + "links": [], + "slot_index": 2, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "ImageResizeKJ" + }, + "widgets_values": [ + 720, + 480, + "lanczos", + false, + 16, + 0, + 0, + "disabled" + ] + }, + { + "id": 36, + "type": "LoadImage", + "pos": { + "0": 20, + "1": 674 + }, + "size": { + "0": 402.06353759765625, + "1": 396.6225891113281 + }, + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 71 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "MASK", + "type": "MASK", + "links": null, + "shape": 3 + } + ], + "title": "Load Image: Start", + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "sd3stag.png", + "image" + ] + }, + { + "id": 66, + "type": "LoadImage", + "pos": { + "0": 20, + "1": 1121 + }, + "size": { + "0": 402.06353759765625, + "1": 396.6225891113281 + }, + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 151 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "MASK", + "type": "MASK", + "links": null, + "shape": 3 + } + ], + "title": "Load Image: End", + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "sd3stag.png", + "image" + ] + }, + { + "id": 56, + "type": "CogVideoDecode", + "pos": { + "0": 1581, + "1": 148 + }, + "size": { + "0": 300.396484375, + "1": 198 + }, + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 128 + }, + { + "name": "samples", + "type": "LATENT", + "link": 127 + 
} + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 118 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoDecode" + }, + "widgets_values": [ + true, + 240, + 360, + 0.2, + 0.2, + true + ] + } + ], + "links": [ + [ + 54, + 20, + 0, + 30, + 0, + "CLIP" + ], + [ + 56, + 20, + 0, + 31, + 0, + "CLIP" + ], + [ + 71, + 36, + 0, + 37, + 0, + "IMAGE" + ], + [ + 118, + 56, + 0, + 44, + 0, + "IMAGE" + ], + [ + 121, + 1, + 0, + 57, + 0, + "COGVIDEOPIPE" + ], + [ + 122, + 30, + 0, + 57, + 1, + "CONDITIONING" + ], + [ + 123, + 31, + 0, + 57, + 2, + "CONDITIONING" + ], + [ + 127, + 57, + 1, + 56, + 1, + "LATENT" + ], + [ + 128, + 57, + 0, + 56, + 0, + "COGVIDEOPIPE" + ], + [ + 146, + 65, + 0, + 57, + 4, + "LATENT" + ], + [ + 147, + 37, + 0, + 65, + 1, + "IMAGE" + ], + [ + 149, + 1, + 0, + 65, + 0, + "COGVIDEOPIPE" + ], + [ + 150, + 37, + 0, + 67, + 1, + "IMAGE" + ], + [ + 151, + 66, + 0, + 67, + 0, + "IMAGE" + ], + [ + 152, + 67, + 0, + 65, + 2, + "IMAGE" + ] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.693433494944327, + "offset": [ + 225.6761629383604, + -15.041612364034256 + ] + } + }, + "version": 0.4 +} \ No newline at end of file diff --git a/nodes.py b/nodes.py index 59fce7d..9d1dc96 100644 --- a/nodes.py +++ b/nodes.py @@ -258,6 +258,7 @@ class DownloadAndLoadCogVideoModel: "alibaba-pai/CogVideoX-Fun-V1.1-5b-InP", "alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose", "alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose", + "feizhengcong/CogvideoX-Interpolation", ], ), @@ -313,14 +314,15 @@ class DownloadAndLoadCogVideoModel: base_path = os.path.join(download_path, "CogVideo2B") download_path = base_path repo_id = model - elif "5b" in model: + else: base_path = os.path.join(download_path, (model.split("/")[-1])) download_path = base_path repo_id = model + if "2b" in model: scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json') - elif "5b" in model: + else: 
scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json') if not os.path.exists(base_path): @@ -799,7 +801,7 @@ class CogVideoImageEncode: "image": ("IMAGE", ), }, "optional": { - "chunk_size": ("INT", {"default": 16, "min": 1}), + "chunk_size": ("INT", {"default": 16, "min": 4}), "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}), "mask": ("MASK", ), }, @@ -875,6 +877,77 @@ class CogVideoImageEncode: vae.to(offload_device) return ({"samples": final_latents}, ) + +class CogVideoImageInterpolationEncode: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "pipeline": ("COGVIDEOPIPE",), + "start_image": ("IMAGE", ), + "end_image": ("IMAGE", ), + }, + "optional": { + "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}), + "mask": ("MASK", ), + }, + } + + RETURN_TYPES = ("LATENT",) + RETURN_NAMES = ("samples",) + FUNCTION = "encode" + CATEGORY = "CogVideoWrapper" + + def encode(self, pipeline, start_image, end_image, chunk_size=8, enable_tiling=False, mask=None): + device = mm.get_torch_device() + offload_device = mm.unet_offload_device() + generator = torch.Generator(device=device).manual_seed(0) + + B, H, W, C = start_image.shape + + vae = pipeline["pipe"].vae + vae.enable_slicing() + + if enable_tiling: + from .mz_enable_vae_encode_tiling import enable_vae_encode_tiling + enable_vae_encode_tiling(vae) + + if not pipeline["cpu_offloading"]: + vae.to(device) + + check_diffusers_version() + vae._clear_fake_context_parallel_cache() + + if mask is not None: + pipeline["pipe"].original_mask = mask + # print(mask.shape) + # mask = mask.repeat(B, 1, 1) # Shape: [B, H, W] + # mask = mask.unsqueeze(-1).repeat(1, 1, 1, C) + # print(mask.shape) + # input_image = input_image * (1 -mask) + else: + pipeline["pipe"].original_mask = None + + start_image = (start_image * 2.0 - 
1.0).to(vae.dtype).to(device).unsqueeze(0).permute(0, 4, 1, 2, 3) # B, C, T, H, W + end_image = (end_image * 2.0 - 1.0).to(vae.dtype).to(device).unsqueeze(0).permute(0, 4, 1, 2, 3) + B, T, C, H, W = start_image.shape + + latents_list = [] + + # Encode the chunk of images + start_latents = vae.encode(start_image).latent_dist.sample(generator) * vae.config.scaling_factor + end_latents = vae.encode(end_image).latent_dist.sample(generator) * vae.config.scaling_factor + + start_latents = start_latents.permute(0, 2, 1, 3, 4) # B, T, C, H, W + end_latents = end_latents.permute(0, 2, 1, 3, 4) # B, T, C, H, W + latents_list = [start_latents, end_latents] + + # Concatenate all the chunks along the temporal dimension + final_latents = torch.cat(latents_list, dim=1) + log.info(f"Encoded latents shape: {final_latents.shape}") + if not pipeline["cpu_offloading"]: + vae.to(offload_device) + + return ({"samples": final_latents}, ) class CogVideoSampler: @classmethod @@ -1500,6 +1573,7 @@ NODE_CLASS_MAPPINGS = { "CogVideoTextEncode": CogVideoTextEncode, "CogVideoDualTextEncode_311": CogVideoDualTextEncode_311, "CogVideoImageEncode": CogVideoImageEncode, + "CogVideoImageInterpolationEncode": CogVideoImageInterpolationEncode, "CogVideoXFunSampler": CogVideoXFunSampler, "CogVideoXFunVid2VidSampler": CogVideoXFunVid2VidSampler, "CogVideoXFunControlSampler": CogVideoXFunControlSampler, @@ -1520,6 +1594,7 @@ NODE_DISPLAY_NAME_MAPPINGS = { "CogVideoTextEncode": "CogVideo TextEncode", "CogVideoDualTextEncode_311": "CogVideo DualTextEncode", "CogVideoImageEncode": "CogVideo ImageEncode", + "CogVideoImageInterpolationEncode": "CogVideo ImageInterpolation Encode", "CogVideoXFunSampler": "CogVideoXFun Sampler", "CogVideoXFunVid2VidSampler": "CogVideoXFun Vid2Vid Sampler", "CogVideoXFunControlSampler": "CogVideoXFun Control Sampler", diff --git a/pipeline_cogvideox.py b/pipeline_cogvideox.py index 64208f0..a40d3e4 100644 --- a/pipeline_cogvideox.py +++ b/pipeline_cogvideox.py @@ -501,15 +501,34 
@@ class CogVideoXPipeline(VideoSysPipeline): # 5.5. if image_cond_latents is not None: - padding_shape = ( + if image_cond_latents.shape[1] > 1: + logger.info("More than one image conditioning frame received, interpolating") + padding_shape = ( batch_size, - (latents.shape[1] - 1), + (latents.shape[1] - 2), self.vae.config.latent_channels, height // self.vae_scale_factor_spatial, width // self.vae_scale_factor_spatial, - ) - latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype) - image_cond_latents = torch.cat([image_cond_latents, latent_padding], dim=1) + ) + logger.debug("padding_shape: %s", padding_shape) + latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype) + logger.debug("image_cond_latents shape: %s", image_cond_latents.shape) + logger.debug("first frame latent shape: %s", image_cond_latents[:, 0, :, :, :].shape) + logger.debug("last frame latent shape: %s", image_cond_latents[:, -1, :, :, :].shape) + + image_cond_latents = torch.cat([image_cond_latents[:, 0, :, :, :].unsqueeze(1), latent_padding, image_cond_latents[:, -1, :, :, :].unsqueeze(1)], dim=1) + logger.debug("image cond latents shape: %s", image_cond_latents.shape) + else: + logger.info("Only one image conditioning frame received, img2vid") + padding_shape = ( + batch_size, + (latents.shape[1] - 1), + self.vae.config.latent_channels, + height // self.vae_scale_factor_spatial, + width // self.vae_scale_factor_spatial, + ) + latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype) + image_cond_latents = torch.cat([image_cond_latents, latent_padding], dim=1) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)