mirror of https://git.datalinker.icu/kijai/ComfyUI-CogVideoXWrapper.git (synced 2025-12-14 23:44:26 +08:00)
Commit 47a79dae4e — add cogvideox-2b-img2vid
Parent: 9e488568b2

nodes.py (8 changed lines)
@@ -269,6 +269,7 @@ class DownloadAndLoadCogVideoModel:
             "alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose",
             "alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose",
             "feizhengcong/CogvideoX-Interpolation",
+            "NimVideo/cogvideox-2b-img2vid"
         ],
     ),
 
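NimVideo/cogvideox-2b-img2vid is, per its repo name, an image-to-video variant of CogVideoX-2B. This hunk only exposes the repo id in the node's model dropdown; the hunks below wire up its download path and its different conditioning behavior.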
@@ -322,6 +323,11 @@ class DownloadAndLoadCogVideoModel:
             download_path = base_path
 
         elif "2b" in model:
-            base_path = os.path.join(download_path, "CogVideo2B")
-            download_path = base_path
-            repo_id = model
+            if 'img2vid' in model:
+                base_path = os.path.join(download_path, "cogvideox-2b-img2vid")
+                download_path = base_path
+                repo_id = model
+            else:
+                base_path = os.path.join(download_path, "CogVideo2B")
+                download_path = base_path
+                repo_id = model
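For context, a minimal sketch of what this branch sets up — not the node's verbatim code. The resolve_2b_paths helper and the models/CogVideo root are hypothetical, and fetching with huggingface_hub's snapshot_download is an assumption about how the wrapper downloads weights:

    import os
    from huggingface_hub import snapshot_download  # assumed download mechanism

    def resolve_2b_paths(model: str, download_path: str):
        # Mirrors the branch added above: the img2vid checkpoint gets its own
        # folder, while any other 2b checkpoint shares the "CogVideo2B" folder.
        if "img2vid" in model:
            base_path = os.path.join(download_path, "cogvideox-2b-img2vid")
        else:
            base_path = os.path.join(download_path, "CogVideo2B")
        return base_path, model  # (local folder, HF repo id)

    base_path, repo_id = resolve_2b_paths("NimVideo/cogvideox-2b-img2vid", "models/CogVideo")
    if not os.path.exists(base_path):
        snapshot_download(repo_id=repo_id, local_dir=base_path)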
@@ -400,6 +406,8 @@ class DownloadAndLoadCogVideoModel:
         else:
             vae = AutoencoderKLCogVideoX.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device)
         pipe = CogVideoXPipeline(vae, transformer, scheduler, pab_config=pab_config)
+        if "cogvideox-2b-img2vid" in model:
+            pipe.input_with_padding = False
 
         if enable_sequential_cpu_offload:
             pipe.enable_sequential_cpu_offload()
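Flipping input_with_padding to False for this checkpoint is what routes the pipeline into the new repeat-based conditioning branch added further down; every other model keeps the original zero-padding behavior.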
pipeline_cogvideox.py (file name inferred from the CogVideoXPipeline hunks below; the header for this second changed file was lost in the page capture)
@@ -161,10 +161,11 @@ class CogVideoXPipeline(VideoSysPipeline):
         self.original_mask = original_mask
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
         if pab_config is not None:
             set_pab_manager(pab_config)
 
+        self.input_with_padding = True
 
     def prepare_latents(
         self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, timesteps, denoise_strength,
         num_inference_steps, latents=None, freenoise=True, context_size=None, context_overlap=None
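Defaulting the attribute to True in __init__ preserves the existing zero-padding behavior for all other checkpoints; only the loader's explicit opt-out above changes it.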
@@ -517,6 +518,7 @@ class CogVideoXPipeline(VideoSysPipeline):
                 logger.info(f"image cond latents shape: {image_cond_latents.shape}")
             else:
                 logger.info("Only one image conditioning frame received, img2vid")
-                padding_shape = (
-                    batch_size,
-                    (latents.shape[1] - 1),
+                if self.input_with_padding:
+                    padding_shape = (
+                        batch_size,
+                        (latents.shape[1] - 1),
@@ -526,7 +528,8 @@ class CogVideoXPipeline(VideoSysPipeline):
-                )
-                latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
-                image_cond_latents = torch.cat([image_cond_latents, latent_padding], dim=1)
+                    )
+                    latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
+                    image_cond_latents = torch.cat([image_cond_latents, latent_padding], dim=1)
+                else:
+                    image_cond_latents = image_cond_latents.repeat(1, latents.shape[1], 1, 1, 1)
 
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
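To make the two conditioning layouts concrete, here is a standalone sketch of the tensors involved. The shape numbers are illustrative assumptions (13 latent frames and 16 latent channels, roughly matching a 49-frame 480x720 CogVideoX clip), not values pulled from the pipeline:

    import torch

    batch, frames, ch, h, w = 1, 13, 16, 60, 90           # illustrative latent shape
    image_cond_latents = torch.randn(batch, 1, ch, h, w)  # one encoded first frame

    input_with_padding = True  # False for NimVideo/cogvideox-2b-img2vid
    if input_with_padding:
        # Default path: real frame first, zeros in the remaining frames-1 slots.
        padding = torch.zeros(batch, frames - 1, ch, h, w)
        cond = torch.cat([image_cond_latents, padding], dim=1)
    else:
        # img2vid path: tile the single frame across every temporal position.
        cond = image_cond_latents.repeat(1, frames, 1, 1, 1)

    assert cond.shape == (batch, frames, ch, h, w)

Either way the conditioning tensor ends up with the same temporal length as the denoised latents; the checkpoints simply differ in whether the frames after the first carry zeros or copies of the first frame.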