From 47a79dae4eb35a1a32dd498e30aa7dc850c0fcee Mon Sep 17 00:00:00 2001
From: denk
Date: Thu, 24 Oct 2024 22:42:09 +0300
Subject: [PATCH] add cogvideox-2b-img2vid

---
 nodes.py              | 18 +++++++++++++-----
 pipeline_cogvideox.py | 27 +++++++++++++++------------
 2 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/nodes.py b/nodes.py
index 351dc53..683c473 100644
--- a/nodes.py
+++ b/nodes.py
@@ -269,6 +269,7 @@ class DownloadAndLoadCogVideoModel:
                     "alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose",
                     "alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose",
                     "feizhengcong/CogvideoX-Interpolation",
+                    "NimVideo/cogvideox-2b-img2vid"
                 ],
             ),
 
@@ -322,9 +323,14 @@ class DownloadAndLoadCogVideoModel:
 
             download_path = base_path
         elif "2b" in model:
-            base_path = os.path.join(download_path, "CogVideo2B")
-            download_path = base_path
-            repo_id = model
+            if 'img2vid' in model:
+                base_path = os.path.join(download_path, "cogvideox-2b-img2vid")
+                download_path = base_path
+                repo_id = model
+            else:
+                base_path = os.path.join(download_path, "CogVideo2B")
+                download_path = base_path
+                repo_id = model
         else:
             base_path = os.path.join(download_path, (model.split("/")[-1]))
             download_path = base_path
@@ -399,7 +405,9 @@ class DownloadAndLoadCogVideoModel:
                 pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler, pab_config=pab_config)
             else:
                 vae = AutoencoderKLCogVideoX.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device)
-                pipe = CogVideoXPipeline(vae, transformer, scheduler, pab_config=pab_config)
+                pipe = CogVideoXPipeline(vae, transformer, scheduler, pab_config=pab_config)
+                if "cogvideox-2b-img2vid" in model:
+                    pipe.input_with_padding = False
 
         if enable_sequential_cpu_offload:
             pipe.enable_sequential_cpu_offload()
@@ -1901,4 +1909,4 @@ NODE_DISPLAY_NAME_MAPPINGS = {
     "ToraEncodeTrajectory": "Tora Encode Trajectory",
     "ToraEncodeOpticalFlow": "Tora Encode OpticalFlow",
     "DownloadAndLoadToraModel": "(Down)load Tora Model",
-    }
+    }
\ No newline at end of file
diff --git a/pipeline_cogvideox.py b/pipeline_cogvideox.py
index 677402a..d787f33 100644
--- a/pipeline_cogvideox.py
+++ b/pipeline_cogvideox.py
@@ -161,10 +161,11 @@ class CogVideoXPipeline(VideoSysPipeline):
         self.original_mask = original_mask
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
-
         if pab_config is not None:
             set_pab_manager(pab_config)
 
+        self.input_with_padding = True
+
     def prepare_latents(
         self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator,
         timesteps, denoise_strength, num_inference_steps, latents=None, freenoise=True, context_size=None, context_overlap=None
@@ -517,16 +518,18 @@ class CogVideoXPipeline(VideoSysPipeline):
                     logger.info(f"image cond latents shape: {image_cond_latents.shape}")
                 else:
                     logger.info("Only one image conditioning frame received, img2vid")
-                    padding_shape = (
-                        batch_size,
-                        (latents.shape[1] - 1),
-                        self.vae.config.latent_channels,
-                        height // self.vae_scale_factor_spatial,
-                        width // self.vae_scale_factor_spatial,
-                    )
-                    latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
-                    image_cond_latents = torch.cat([image_cond_latents, latent_padding], dim=1)
-
+                    if self.input_with_padding:
+                        padding_shape = (
+                            batch_size,
+                            (latents.shape[1] - 1),
+                            self.vae.config.latent_channels,
+                            height // self.vae_scale_factor_spatial,
+                            width // self.vae_scale_factor_spatial,
+                        )
+                        latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
+                        image_cond_latents = torch.cat([image_cond_latents, latent_padding], dim=1)
+                    else:
+                        image_cond_latents = image_cond_latents.repeat(1, latents.shape[1], 1, 1, 1)
 
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -921,4 +924,4 @@ class CogVideoXPipeline(VideoSysPipeline):
         # Offload all models
         self.maybe_free_model_hooks()
 
-        return latents
+        return latents
\ No newline at end of file