Support CogVideoX-Interpolation model

https://github.com/feizc/CogvideX-Interpolation
kijai 2024-10-17 13:41:23 +03:00
parent 09ed641575
commit 4f8f3aa74f
3 changed files with 933 additions and 8 deletions


@@ -0,0 +1,831 @@
{
"last_node_id": 67,
"last_link_id": 152,
"nodes": [
{
"id": 20,
"type": "CLIPLoader",
"pos": {
"0": -26,
"1": 400
},
"size": {
"0": 451.30548095703125,
"1": 82
},
"flags": {},
"order": 0,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "CLIP",
"type": "CLIP",
"links": [
54,
56
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CLIPLoader"
},
"widgets_values": [
"t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
"sd3"
]
},
{
"id": 31,
"type": "CogVideoTextEncode",
"pos": {
"0": 497,
"1": 520
},
"size": {
"0": 463.01251220703125,
"1": 124
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 56
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
123
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. ",
1,
true
]
},
{
"id": 30,
"type": "CogVideoTextEncode",
"pos": {
"0": 493,
"1": 303
},
"size": {
"0": 471.90142822265625,
"1": 168.08047485351562
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 54
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
122
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"a majestic stag is grazing in an enhanced forest, basking in the setting sun filtered by the trees",
1,
true
]
},
{
"id": 57,
"type": "CogVideoSampler",
"pos": {
"0": 1138,
"1": 150
},
"size": {
"0": 399.8780822753906,
"1": 370
},
"flags": {},
"order": 9,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 121
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 122
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 123
},
{
"name": "samples",
"type": "LATENT",
"link": null,
"shape": 7
},
{
"name": "image_cond_latents",
"type": "LATENT",
"link": 146,
"shape": 7
},
{
"name": "context_options",
"type": "COGCONTEXT",
"link": null,
"shape": 7
},
{
"name": "controlnet",
"type": "COGVIDECONTROLNET",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
128
],
"slot_index": 0,
"shape": 3
},
{
"name": "samples",
"type": "LATENT",
"links": [
127
],
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoSampler"
},
"widgets_values": [
480,
720,
49,
20,
6,
65334758276105,
"fixed",
"CogVideoXDPMScheduler",
1
]
},
{
"id": 1,
"type": "DownloadAndLoadCogVideoModel",
"pos": {
"0": 633,
"1": 44
},
"size": {
"0": 337.8885192871094,
"1": 194
},
"flags": {},
"order": 1,
"mode": 0,
"inputs": [
{
"name": "pab_config",
"type": "PAB_CONFIG",
"link": null,
"shape": 7
},
{
"name": "block_edit",
"type": "TRANSFORMERBLOCKS",
"link": null,
"shape": 7
},
{
"name": "lora",
"type": "COGLORA",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
121,
149
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoModel"
},
"widgets_values": [
"feizhengcong/CogvideoX-Interpolation",
"bf16",
"disabled",
"disabled",
false
]
},
{
"id": 65,
"type": "CogVideoImageInterpolationEncode",
"pos": {
"0": 1123,
"1": 647
},
"size": [
331.6177535935244,
118
],
"flags": {},
"order": 8,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 149
},
{
"name": "start_image",
"type": "IMAGE",
"link": 147
},
{
"name": "end_image",
"type": "IMAGE",
"link": 152
},
{
"name": "mask",
"type": "MASK",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "samples",
"type": "LATENT",
"links": [
146
],
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoImageInterpolationEncode"
},
"widgets_values": [
false
]
},
{
"id": 44,
"type": "VHS_VideoCombine",
"pos": {
"0": 1927,
"1": 146
},
"size": [
605.3909912109375,
714.2606608072917
],
"flags": {},
"order": 11,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 118
},
{
"name": "audio",
"type": "AUDIO",
"link": null,
"shape": 7
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null,
"shape": 7
},
{
"name": "vae",
"type": "VAE",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "Filenames",
"type": "VHS_FILENAMES",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_VideoCombine"
},
"widgets_values": {
"frame_rate": 8,
"loop_count": 0,
"filename_prefix": "CogVideoX_interpolation",
"format": "video/h264-mp4",
"pix_fmt": "yuv420p",
"crf": 19,
"save_metadata": true,
"pingpong": false,
"save_output": false,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "CogVideoX-I2V_00001.mp4",
"subfolder": "",
"type": "temp",
"format": "video/h264-mp4",
"frame_rate": 8
},
"muted": false
}
}
},
{
"id": 67,
"type": "ImageResizeKJ",
"pos": {
"0": 569,
"1": 1173
},
"size": [
315,
266
],
"flags": {
"collapsed": true
},
"order": 7,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 151
},
{
"name": "get_image_size",
"type": "IMAGE",
"link": 150,
"shape": 7
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
},
"shape": 7
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
},
"shape": 7
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
152
],
"slot_index": 0,
"shape": 3
},
{
"name": "width",
"type": "INT",
"links": [],
"slot_index": 1,
"shape": 3
},
{
"name": "height",
"type": "INT",
"links": [],
"slot_index": 2,
"shape": 3
}
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [
720,
480,
"lanczos",
false,
16,
0,
0,
"disabled"
]
},
{
"id": 37,
"type": "ImageResizeKJ",
"pos": {
"0": 537,
"1": 722
},
"size": {
"0": 315,
"1": 266
},
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 71
},
{
"name": "get_image_size",
"type": "IMAGE",
"link": null,
"shape": 7
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
147,
150
],
"slot_index": 0,
"shape": 3
},
{
"name": "width",
"type": "INT",
"links": [],
"slot_index": 1,
"shape": 3
},
{
"name": "height",
"type": "INT",
"links": [],
"slot_index": 2,
"shape": 3
}
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [
720,
480,
"lanczos",
false,
16,
0,
0,
"disabled"
]
},
{
"id": 36,
"type": "LoadImage",
"pos": {
"0": 20,
"1": 674
},
"size": {
"0": 402.06353759765625,
"1": 396.6225891113281
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
71
],
"slot_index": 0,
"shape": 3
},
{
"name": "MASK",
"type": "MASK",
"links": null,
"shape": 3
}
],
"title": "Load Image: Start",
"properties": {
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"sd3stag.png",
"image"
]
},
{
"id": 66,
"type": "LoadImage",
"pos": {
"0": 20,
"1": 1121
},
"size": {
"0": 402.06353759765625,
"1": 396.6225891113281
},
"flags": {},
"order": 3,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
151
],
"slot_index": 0,
"shape": 3
},
{
"name": "MASK",
"type": "MASK",
"links": null,
"shape": 3
}
],
"title": "Load Image: End",
"properties": {
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"sd3stag.png",
"image"
]
},
{
"id": 56,
"type": "CogVideoDecode",
"pos": {
"0": 1581,
"1": 148
},
"size": {
"0": 300.396484375,
"1": 198
},
"flags": {},
"order": 10,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 128
},
{
"name": "samples",
"type": "LATENT",
"link": 127
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
118
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
},
"widgets_values": [
true,
240,
360,
0.2,
0.2,
true
]
}
],
"links": [
[
54,
20,
0,
30,
0,
"CLIP"
],
[
56,
20,
0,
31,
0,
"CLIP"
],
[
71,
36,
0,
37,
0,
"IMAGE"
],
[
118,
56,
0,
44,
0,
"IMAGE"
],
[
121,
1,
0,
57,
0,
"COGVIDEOPIPE"
],
[
122,
30,
0,
57,
1,
"CONDITIONING"
],
[
123,
31,
0,
57,
2,
"CONDITIONING"
],
[
127,
57,
1,
56,
1,
"LATENT"
],
[
128,
57,
0,
56,
0,
"COGVIDEOPIPE"
],
[
146,
65,
0,
57,
4,
"LATENT"
],
[
147,
37,
0,
65,
1,
"IMAGE"
],
[
149,
1,
0,
65,
0,
"COGVIDEOPIPE"
],
[
150,
37,
0,
67,
1,
"IMAGE"
],
[
151,
66,
0,
67,
0,
"IMAGE"
],
[
152,
67,
0,
65,
2,
"IMAGE"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.693433494944327,
"offset": [
225.6761629383604,
-15.041612364034256
]
}
},
"version": 0.4
}


@@ -258,6 +258,7 @@ class DownloadAndLoadCogVideoModel:
                    "alibaba-pai/CogVideoX-Fun-V1.1-5b-InP",
                    "alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose",
                    "alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose",
+                   "feizhengcong/CogvideoX-Interpolation",
                ],
            ),
@@ -313,14 +314,15 @@ class DownloadAndLoadCogVideoModel:
            base_path = os.path.join(download_path, "CogVideo2B")
            download_path = base_path
            repo_id = model
-       elif "5b" in model:
+       else:
            base_path = os.path.join(download_path, (model.split("/")[-1]))
            download_path = base_path
            repo_id = model

        if "2b" in model:
            scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json')
-       elif "5b" in model:
+       else:
            scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json')

        if not os.path.exists(base_path):
@@ -799,7 +801,7 @@ class CogVideoImageEncode:
                "image": ("IMAGE", ),
            },
            "optional": {
-               "chunk_size": ("INT", {"default": 16, "min": 1}),
+               "chunk_size": ("INT", {"default": 16, "min": 4}),
                "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
                "mask": ("MASK", ),
            },
@@ -875,6 +877,77 @@ class CogVideoImageEncode:
            vae.to(offload_device)
        return ({"samples": final_latents}, )

class CogVideoImageInterpolationEncode:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "pipeline": ("COGVIDEOPIPE",),
            "start_image": ("IMAGE", ),
            "end_image": ("IMAGE", ),
            },
            "optional": {
                "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
                "mask": ("MASK", ),
            },
        }

    RETURN_TYPES = ("LATENT",)
    RETURN_NAMES = ("samples",)
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    def encode(self, pipeline, start_image, end_image, chunk_size=8, enable_tiling=False, mask=None):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        generator = torch.Generator(device=device).manual_seed(0)

        B, H, W, C = start_image.shape
        vae = pipeline["pipe"].vae
        vae.enable_slicing()
        if enable_tiling:
            from .mz_enable_vae_encode_tiling import enable_vae_encode_tiling
            enable_vae_encode_tiling(vae)

        if not pipeline["cpu_offloading"]:
            vae.to(device)

        check_diffusers_version()
        vae._clear_fake_context_parallel_cache()

        if mask is not None:
            pipeline["pipe"].original_mask = mask
            # print(mask.shape)
            # mask = mask.repeat(B, 1, 1) # Shape: [B, H, W]
            # mask = mask.unsqueeze(-1).repeat(1, 1, 1, C)
            # print(mask.shape)
            # input_image = input_image * (1 - mask)
        else:
            pipeline["pipe"].original_mask = None

        start_image = (start_image * 2.0 - 1.0).to(vae.dtype).to(device).unsqueeze(0).permute(0, 4, 1, 2, 3) # B, C, T, H, W
        end_image = (end_image * 2.0 - 1.0).to(vae.dtype).to(device).unsqueeze(0).permute(0, 4, 1, 2, 3)
        B, T, C, H, W = start_image.shape

        latents_list = []
        # Encode the chunk of images
        start_latents = vae.encode(start_image).latent_dist.sample(generator) * vae.config.scaling_factor
        end_latents = vae.encode(end_image).latent_dist.sample(generator) * vae.config.scaling_factor
        start_latents = start_latents.permute(0, 2, 1, 3, 4) # B, T, C, H, W
        end_latents = end_latents.permute(0, 2, 1, 3, 4) # B, T, C, H, W
        latents_list = [start_latents, end_latents]

        # Concatenate all the chunks along the temporal dimension
        final_latents = torch.cat(latents_list, dim=1)
        log.info(f"Encoded latents shape: {final_latents.shape}")

        if not pipeline["cpu_offloading"]:
            vae.to(offload_device)
        return ({"samples": final_latents}, )

class CogVideoSampler:
    @classmethod
@@ -1500,6 +1573,7 @@ NODE_CLASS_MAPPINGS = {
    "CogVideoTextEncode": CogVideoTextEncode,
    "CogVideoDualTextEncode_311": CogVideoDualTextEncode_311,
    "CogVideoImageEncode": CogVideoImageEncode,
+   "CogVideoImageInterpolationEncode": CogVideoImageInterpolationEncode,
    "CogVideoXFunSampler": CogVideoXFunSampler,
    "CogVideoXFunVid2VidSampler": CogVideoXFunVid2VidSampler,
    "CogVideoXFunControlSampler": CogVideoXFunControlSampler,
@@ -1520,6 +1594,7 @@ NODE_DISPLAY_NAME_MAPPINGS = {
    "CogVideoTextEncode": "CogVideo TextEncode",
    "CogVideoDualTextEncode_311": "CogVideo DualTextEncode",
    "CogVideoImageEncode": "CogVideo ImageEncode",
+   "CogVideoImageInterpolationEncode": "CogVideo ImageInterpolation Encode",
    "CogVideoXFunSampler": "CogVideoXFun Sampler",
    "CogVideoXFunVid2VidSampler": "CogVideoXFun Vid2Vid Sampler",
    "CogVideoXFunControlSampler": "CogVideoXFun Control Sampler",


@@ -501,15 +501,34 @@ class CogVideoXPipeline(VideoSysPipeline):
        # 5.5.
        if image_cond_latents is not None:
-           padding_shape = (
-               batch_size,
-               (latents.shape[1] - 1),
-               self.vae.config.latent_channels,
-               height // self.vae_scale_factor_spatial,
-               width // self.vae_scale_factor_spatial,
-           )
-           latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
-           image_cond_latents = torch.cat([image_cond_latents, latent_padding], dim=1)
+           if image_cond_latents.shape[1] > 1:
+               logger.info("More than one image conditioning frame received, interpolating")
+               padding_shape = (
+                   batch_size,
+                   (latents.shape[1] - 2),
+                   self.vae.config.latent_channels,
+                   height // self.vae_scale_factor_spatial,
+                   width // self.vae_scale_factor_spatial,
+               )
+               print("padding_shape: ", padding_shape)
+               latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
+               print(image_cond_latents.shape)
+               print(image_cond_latents[:, 0, :, :, :].shape)
+               print(image_cond_latents[:, -1, :, :, :].shape)
+               image_cond_latents = torch.cat([image_cond_latents[:, 0, :, :, :].unsqueeze(1), latent_padding, image_cond_latents[:, -1, :, :, :].unsqueeze(1)], dim=1)
+               print("image cond latents shape", image_cond_latents.shape)
+           else:
+               logger.info("Only one image conditioning frame received, img2vid")
+               padding_shape = (
+                   batch_size,
+                   (latents.shape[1] - 1),
+                   self.vae.config.latent_channels,
+                   height // self.vae_scale_factor_spatial,
+                   width // self.vae_scale_factor_spatial,
+               )
+               latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
+               image_cond_latents = torch.cat([image_cond_latents, latent_padding], dim=1)

        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
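
For orientation, the interpolation branch above keeps only the first and last conditioning latents and zero-fills the frames between them. The standalone sketch below reproduces that tensor construction with made-up sizes; the shapes, names, and values are illustrative assumptions, not code from this commit.

import torch

# Hypothetical sizes for illustration: 13 latent frames (49 video frames with
# temporal compression 4), 16 latent channels, 60x90 latent resolution
# (480x720 divided by spatial factor 8).
batch_size, num_latent_frames, latent_channels, latent_h, latent_w = 1, 13, 16, 60, 90

# Stand-ins for the encoded start/end frame latents produced by the
# interpolation encode node (each has a temporal length of 1).
start_latent = torch.randn(batch_size, 1, latent_channels, latent_h, latent_w)
end_latent = torch.randn(batch_size, 1, latent_channels, latent_h, latent_w)

# Interpolation conditioning: keep the first and last latent frames and fill
# everything in between with zeros, so the model only "sees" the two key
# frames and generates the motion between them.
padding = torch.zeros(batch_size, num_latent_frames - 2, latent_channels, latent_h, latent_w)
image_cond_latents = torch.cat([start_latent, padding, end_latent], dim=1)

print(image_cond_latents.shape)  # torch.Size([1, 13, 16, 60, 90])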