Support CogVideoX-Interpolation -model

https://github.com/feizc/CogvideX-Interpolation
2026-07-09 18:27:02 +08:00 · 2024-10-17 13:41:23 +03:00 · 2024-10-17 13:41:23 +03:00 · 4f8f3aa74f
commit 4f8f3aa74f
parent 09ed641575
3 changed files with 933 additions and 8 deletions
--- a/examples/cogvideox_interpolation_example_01.json
+++ b/examples/cogvideox_interpolation_example_01.json
@ -0,0 +1,831 @@
 {
  "last_node_id": 67,
  "last_link_id": 152,
  "nodes": [
    {
      "id": 20,
      "type": "CLIPLoader",
      "pos": {
        "0": -26,
        "1": 400
      },
      "size": {
        "0": 451.30548095703125,
        "1": 82
      },
      "flags": {},
      "order": 0,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "CLIP",
          "type": "CLIP",
          "links": [
            54,
            56
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "CLIPLoader"
      },
      "widgets_values": [
        "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
        "sd3"
      ]
    },
    {
      "id": 31,
      "type": "CogVideoTextEncode",
      "pos": {
        "0": 497,
        "1": 520
      },
      "size": {
        "0": 463.01251220703125,
        "1": 124
      },
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 56
        }
      ],
      "outputs": [
        {
          "name": "conditioning",
          "type": "CONDITIONING",
          "links": [
            123
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoTextEncode"
      },
      "widgets_values": [
        "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. ",
        1,
        true
      ]
    },
    {
      "id": 30,
      "type": "CogVideoTextEncode",
      "pos": {
        "0": 493,
        "1": 303
      },
      "size": {
        "0": 471.90142822265625,
        "1": 168.08047485351562
      },
      "flags": {},
      "order": 4,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 54
        }
      ],
      "outputs": [
        {
          "name": "conditioning",
          "type": "CONDITIONING",
          "links": [
            122
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoTextEncode"
      },
      "widgets_values": [
        "a majestic stag is grazing in an enhanced forest, basking in the setting sun filtered by the trees",
        1,
        true
      ]
    },
    {
      "id": 57,
      "type": "CogVideoSampler",
      "pos": {
        "0": 1138,
        "1": 150
      },
      "size": {
        "0": 399.8780822753906,
        "1": 370
      },
      "flags": {},
      "order": 9,
      "mode": 0,
      "inputs": [
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
          "link": 121
        },
        {
          "name": "positive",
          "type": "CONDITIONING",
          "link": 122
        },
        {
          "name": "negative",
          "type": "CONDITIONING",
          "link": 123
        },
        {
          "name": "samples",
          "type": "LATENT",
          "link": null,
          "shape": 7
        },
        {
          "name": "image_cond_latents",
          "type": "LATENT",
          "link": 146,
          "shape": 7
        },
        {
          "name": "context_options",
          "type": "COGCONTEXT",
          "link": null,
          "shape": 7
        },
        {
          "name": "controlnet",
          "type": "COGVIDECONTROLNET",
          "link": null,
          "shape": 7
        }
      ],
      "outputs": [
        {
          "name": "cogvideo_pipe",
          "type": "COGVIDEOPIPE",
          "links": [
            128
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "samples",
          "type": "LATENT",
          "links": [
            127
          ],
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoSampler"
      },
      "widgets_values": [
        480,
        720,
        49,
        20,
        6,
        65334758276105,
        "fixed",
        "CogVideoXDPMScheduler",
        1
      ]
    },
    {
      "id": 1,
      "type": "DownloadAndLoadCogVideoModel",
      "pos": {
        "0": 633,
        "1": 44
      },
      "size": {
        "0": 337.8885192871094,
        "1": 194
      },
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [
        {
          "name": "pab_config",
          "type": "PAB_CONFIG",
          "link": null,
          "shape": 7
        },
        {
          "name": "block_edit",
          "type": "TRANSFORMERBLOCKS",
          "link": null,
          "shape": 7
        },
        {
          "name": "lora",
          "type": "COGLORA",
          "link": null,
          "shape": 7
        }
      ],
      "outputs": [
        {
          "name": "cogvideo_pipe",
          "type": "COGVIDEOPIPE",
          "links": [
            121,
            149
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "DownloadAndLoadCogVideoModel"
      },
      "widgets_values": [
        "feizhengcong/CogvideoX-Interpolation",
        "bf16",
        "disabled",
        "disabled",
        false
      ]
    },
    {
      "id": 65,
      "type": "CogVideoImageInterpolationEncode",
      "pos": {
        "0": 1123,
        "1": 647
      },
      "size": [
        331.6177535935244,
        118
      ],
      "flags": {},
      "order": 8,
      "mode": 0,
      "inputs": [
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
          "link": 149
        },
        {
          "name": "start_image",
          "type": "IMAGE",
          "link": 147
        },
        {
          "name": "end_image",
          "type": "IMAGE",
          "link": 152
        },
        {
          "name": "mask",
          "type": "MASK",
          "link": null,
          "shape": 7
        }
      ],
      "outputs": [
        {
          "name": "samples",
          "type": "LATENT",
          "links": [
            146
          ],
          "slot_index": 0
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoImageInterpolationEncode"
      },
      "widgets_values": [
        false
      ]
    },
    {
      "id": 44,
      "type": "VHS_VideoCombine",
      "pos": {
        "0": 1927,
        "1": 146
      },
      "size": [
        605.3909912109375,
        714.2606608072917
      ],
      "flags": {},
      "order": 11,
      "mode": 0,
      "inputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "link": 118
        },
        {
          "name": "audio",
          "type": "AUDIO",
          "link": null,
          "shape": 7
        },
        {
          "name": "meta_batch",
          "type": "VHS_BatchManager",
          "link": null,
          "shape": 7
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": null,
          "shape": 7
        }
      ],
      "outputs": [
        {
          "name": "Filenames",
          "type": "VHS_FILENAMES",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "VHS_VideoCombine"
      },
      "widgets_values": {
        "frame_rate": 8,
        "loop_count": 0,
        "filename_prefix": "CogVideoX_interpolation",
        "format": "video/h264-mp4",
        "pix_fmt": "yuv420p",
        "crf": 19,
        "save_metadata": true,
        "pingpong": false,
        "save_output": false,
        "videopreview": {
          "hidden": false,
          "paused": false,
          "params": {
            "filename": "CogVideoX-I2V_00001.mp4",
            "subfolder": "",
            "type": "temp",
            "format": "video/h264-mp4",
            "frame_rate": 8
          },
          "muted": false
        }
      }
    },
    {
      "id": 67,
      "type": "ImageResizeKJ",
      "pos": {
        "0": 569,
        "1": 1173
      },
      "size": [
        315,
        266
      ],
      "flags": {
        "collapsed": true
      },
      "order": 7,
      "mode": 0,
      "inputs": [
        {
          "name": "image",
          "type": "IMAGE",
          "link": 151
        },
        {
          "name": "get_image_size",
          "type": "IMAGE",
          "link": 150,
          "shape": 7
        },
        {
          "name": "width_input",
          "type": "INT",
          "link": null,
          "widget": {
            "name": "width_input"
          },
          "shape": 7
        },
        {
          "name": "height_input",
          "type": "INT",
          "link": null,
          "widget": {
            "name": "height_input"
          },
          "shape": 7
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            152
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "width",
          "type": "INT",
          "links": [],
          "slot_index": 1,
          "shape": 3
        },
        {
          "name": "height",
          "type": "INT",
          "links": [],
          "slot_index": 2,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "ImageResizeKJ"
      },
      "widgets_values": [
        720,
        480,
        "lanczos",
        false,
        16,
        0,
        0,
        "disabled"
      ]
    },
    {
      "id": 37,
      "type": "ImageResizeKJ",
      "pos": {
        "0": 537,
        "1": 722
      },
      "size": {
        "0": 315,
        "1": 266
      },
      "flags": {},
      "order": 6,
      "mode": 0,
      "inputs": [
        {
          "name": "image",
          "type": "IMAGE",
          "link": 71
        },
        {
          "name": "get_image_size",
          "type": "IMAGE",
          "link": null,
          "shape": 7
        },
        {
          "name": "width_input",
          "type": "INT",
          "link": null,
          "widget": {
            "name": "width_input"
          }
        },
        {
          "name": "height_input",
          "type": "INT",
          "link": null,
          "widget": {
            "name": "height_input"
          }
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            147,
            150
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "width",
          "type": "INT",
          "links": [],
          "slot_index": 1,
          "shape": 3
        },
        {
          "name": "height",
          "type": "INT",
          "links": [],
          "slot_index": 2,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "ImageResizeKJ"
      },
      "widgets_values": [
        720,
        480,
        "lanczos",
        false,
        16,
        0,
        0,
        "disabled"
      ]
    },
    {
      "id": 36,
      "type": "LoadImage",
      "pos": {
        "0": 20,
        "1": 674
      },
      "size": {
        "0": 402.06353759765625,
        "1": 396.6225891113281
      },
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            71
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "MASK",
          "type": "MASK",
          "links": null,
          "shape": 3
        }
      ],
      "title": "Load Image: Start",
      "properties": {
        "Node name for S&R": "LoadImage"
      },
      "widgets_values": [
        "sd3stag.png",
        "image"
      ]
    },
    {
      "id": 66,
      "type": "LoadImage",
      "pos": {
        "0": 20,
        "1": 1121
      },
      "size": {
        "0": 402.06353759765625,
        "1": 396.6225891113281
      },
      "flags": {},
      "order": 3,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            151
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "MASK",
          "type": "MASK",
          "links": null,
          "shape": 3
        }
      ],
      "title": "Load Image: End",
      "properties": {
        "Node name for S&R": "LoadImage"
      },
      "widgets_values": [
        "sd3stag.png",
        "image"
      ]
    },
    {
      "id": 56,
      "type": "CogVideoDecode",
      "pos": {
        "0": 1581,
        "1": 148
      },
      "size": {
        "0": 300.396484375,
        "1": 198
      },
      "flags": {},
      "order": 10,
      "mode": 0,
      "inputs": [
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
          "link": 128
        },
        {
          "name": "samples",
          "type": "LATENT",
          "link": 127
        }
      ],
      "outputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "links": [
            118
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoDecode"
      },
      "widgets_values": [
        true,
        240,
        360,
        0.2,
        0.2,
        true
      ]
    }
  ],
  "links": [
    [
      54,
      20,
      0,
      30,
      0,
      "CLIP"
    ],
    [
      56,
      20,
      0,
      31,
      0,
      "CLIP"
    ],
    [
      71,
      36,
      0,
      37,
      0,
      "IMAGE"
    ],
    [
      118,
      56,
      0,
      44,
      0,
      "IMAGE"
    ],
    [
      121,
      1,
      0,
      57,
      0,
      "COGVIDEOPIPE"
    ],
    [
      122,
      30,
      0,
      57,
      1,
      "CONDITIONING"
    ],
    [
      123,
      31,
      0,
      57,
      2,
      "CONDITIONING"
    ],
    [
      127,
      57,
      1,
      56,
      1,
      "LATENT"
    ],
    [
      128,
      57,
      0,
      56,
      0,
      "COGVIDEOPIPE"
    ],
    [
      146,
      65,
      0,
      57,
      4,
      "LATENT"
    ],
    [
      147,
      37,
      0,
      65,
      1,
      "IMAGE"
    ],
    [
      149,
      1,
      0,
      65,
      0,
      "COGVIDEOPIPE"
    ],
    [
      150,
      37,
      0,
      67,
      1,
      "IMAGE"
    ],
    [
      151,
      66,
      0,
      67,
      0,
      "IMAGE"
    ],
    [
      152,
      67,
      0,
      65,
      2,
      "IMAGE"
    ]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
      "scale": 0.693433494944327,
      "offset": [
        225.6761629383604,
        -15.041612364034256
      ]
    }
  },
  "version": 0.4
 }
--- a/nodes.py
+++ b/nodes.py
@ -258,6 +258,7 @@ class DownloadAndLoadCogVideoModel:
                        "alibaba-pai/CogVideoX-Fun-V1.1-5b-InP",
                        "alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose",
                        "alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose",
                        "feizhengcong/CogvideoX-Interpolation",
                    ],
                ),
@ -313,14 +314,15 @@ class DownloadAndLoadCogVideoModel:
            base_path = os.path.join(download_path, "CogVideo2B")
            download_path = base_path
            repo_id = model
-        elif "5b" in model:
+        else:
            base_path = os.path.join(download_path, (model.split("/")[-1]))
            download_path = base_path
            repo_id = model
        if "2b" in model:
            scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json')
-        elif "5b" in model:
+        else:
            scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json')
        if not os.path.exists(base_path):
@ -799,7 +801,7 @@ class CogVideoImageEncode:
            "image": ("IMAGE", ),
            },
            "optional": {
-                "chunk_size": ("INT", {"default": 16, "min": 1}),
+                "chunk_size": ("INT", {"default": 16, "min": 4}),
                "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
                "mask": ("MASK", ),
            },
@ -876,6 +878,77 @@ class CogVideoImageEncode:
        return ({"samples": final_latents}, )
 class CogVideoImageInterpolationEncode:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "pipeline": ("COGVIDEOPIPE",),
            "start_image": ("IMAGE", ),
            "end_image": ("IMAGE", ),
            },
            "optional": {
                "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
                "mask": ("MASK", ),
            },
        }
    RETURN_TYPES = ("LATENT",)
    RETURN_NAMES = ("samples",)
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"
    def encode(self, pipeline, start_image, end_image, chunk_size=8, enable_tiling=False, mask=None):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        generator = torch.Generator(device=device).manual_seed(0)
        B, H, W, C = start_image.shape
        vae = pipeline["pipe"].vae
        vae.enable_slicing()
        if enable_tiling:
            from .mz_enable_vae_encode_tiling import enable_vae_encode_tiling
            enable_vae_encode_tiling(vae)
        if not pipeline["cpu_offloading"]:
            vae.to(device)
        check_diffusers_version()
        vae._clear_fake_context_parallel_cache()
        if mask is not None:
            pipeline["pipe"].original_mask = mask
            # print(mask.shape)
            # mask = mask.repeat(B, 1, 1)  # Shape: [B, H, W]
            # mask = mask.unsqueeze(-1).repeat(1, 1, 1, C)
            # print(mask.shape)
            # input_image = input_image * (1 -mask)
        else:
            pipeline["pipe"].original_mask = None
        start_image = (start_image * 2.0 - 1.0).to(vae.dtype).to(device).unsqueeze(0).permute(0, 4, 1, 2, 3) # B, C, T, H, W
        end_image = (end_image * 2.0 - 1.0).to(vae.dtype).to(device).unsqueeze(0).permute(0, 4, 1, 2, 3)
        B, T, C, H, W = start_image.shape
        latents_list = []           
        # Encode the chunk of images
        start_latents = vae.encode(start_image).latent_dist.sample(generator) * vae.config.scaling_factor
        end_latents = vae.encode(end_image).latent_dist.sample(generator) * vae.config.scaling_factor
        start_latents = start_latents.permute(0, 2, 1, 3, 4)  # B, T, C, H, W
        end_latents = end_latents.permute(0, 2, 1, 3, 4)  # B, T, C, H, W
        latents_list = [start_latents, end_latents]
        # Concatenate all the chunks along the temporal dimension
        final_latents = torch.cat(latents_list, dim=1)
        log.info(f"Encoded latents shape: {final_latents.shape}")
        if not pipeline["cpu_offloading"]:
            vae.to(offload_device)
        return ({"samples": final_latents}, )
 class CogVideoSampler:
    @classmethod
    def INPUT_TYPES(s):
@ -1500,6 +1573,7 @@ NODE_CLASS_MAPPINGS = {
    "CogVideoTextEncode": CogVideoTextEncode,
    "CogVideoDualTextEncode_311": CogVideoDualTextEncode_311,
    "CogVideoImageEncode": CogVideoImageEncode,
    "CogVideoImageInterpolationEncode": CogVideoImageInterpolationEncode,
    "CogVideoXFunSampler": CogVideoXFunSampler,
    "CogVideoXFunVid2VidSampler": CogVideoXFunVid2VidSampler,
    "CogVideoXFunControlSampler": CogVideoXFunControlSampler,
@ -1520,6 +1594,7 @@ NODE_DISPLAY_NAME_MAPPINGS = {
    "CogVideoTextEncode": "CogVideo TextEncode",
    "CogVideoDualTextEncode_311": "CogVideo DualTextEncode",
    "CogVideoImageEncode": "CogVideo ImageEncode",
    "CogVideoImageInterpolationEncode": "CogVideo ImageInterpolation Encode",
    "CogVideoXFunSampler": "CogVideoXFun Sampler",
    "CogVideoXFunVid2VidSampler": "CogVideoXFun Vid2Vid Sampler",
    "CogVideoXFunControlSampler": "CogVideoXFun Control Sampler",
--- a/pipeline_cogvideox.py
+++ b/pipeline_cogvideox.py
@ -501,15 +501,34 @@ class CogVideoXPipeline(VideoSysPipeline):
        # 5.5.
        if image_cond_latents is not None:
-            padding_shape = (
+            if image_cond_latents.shape[1] > 1:
                logger.info("More than one image conditioning frame received, interpolating")
                padding_shape = (
                batch_size,
-                (latents.shape[1] - 1),
+                (latents.shape[1] - 2),
                self.vae.config.latent_channels,
                height // self.vae_scale_factor_spatial,
                width // self.vae_scale_factor_spatial,
-            )
+                )
-            latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
+                print("padding_shape: ", padding_shape)
-            image_cond_latents = torch.cat([image_cond_latents, latent_padding], dim=1)
+                latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
                print(image_cond_latents.shape)
                print(image_cond_latents[:, 0, :, :, :].shape)
                print(image_cond_latents[:, -1, :, :, :].shape)
                image_cond_latents = torch.cat([image_cond_latents[:, 0, :, :, :].unsqueeze(1), latent_padding, image_cond_latents[:, -1, :, :, :].unsqueeze(1)], dim=1)
                print("image cond latents shape",image_cond_latents.shape)
            else:
                logger.info("Only one image conditioning frame received, img2vid")
                padding_shape = (
                    batch_size,
                    (latents.shape[1] - 1),
                    self.vae.config.latent_channels,
                    height // self.vae_scale_factor_spatial,
                    width // self.vae_scale_factor_spatial,
                )
                latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
                image_cond_latents = torch.cat([image_cond_latents, latent_padding], dim=1)
        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)