From 47a79dae4eb35a1a32dd498e30aa7dc850c0fcee Mon Sep 17 00:00:00 2001
From: denk
Date: Thu, 24 Oct 2024 22:42:09 +0300
Subject: [PATCH] add cogvideox-2b-img2vid

---
 nodes.py              | 18 +++++++++++++-----
 pipeline_cogvideox.py | 27 +++++++++++++++------------
 2 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/nodes.py b/nodes.py
index 351dc53..683c473 100644
--- a/nodes.py
+++ b/nodes.py
@@ -269,6 +269,7 @@ class DownloadAndLoadCogVideoModel:
                     "alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose",
                     "alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose",
                     "feizhengcong/CogvideoX-Interpolation",
+                    "NimVideo/cogvideox-2b-img2vid"
                 ],
             ),
 
@@ -322,9 +323,14 @@ class DownloadAndLoadCogVideoModel:
 
             download_path = base_path
         elif "2b" in model:
-            base_path = os.path.join(download_path, "CogVideo2B")
-            download_path = base_path
-            repo_id = model
+            if 'img2vid' in model:
+                base_path = os.path.join(download_path, "cogvideox-2b-img2vid")
+                download_path = base_path
+                repo_id = model
+            else:
+                base_path = os.path.join(download_path, "CogVideo2B")
+                download_path = base_path
+                repo_id = model
         else:
             base_path = os.path.join(download_path, (model.split("/")[-1]))
             download_path = base_path
@@ -399,7 +405,9 @@ class DownloadAndLoadCogVideoModel:
                 pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler, pab_config=pab_config)
             else:
                 vae = AutoencoderKLCogVideoX.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device)
-                pipe = CogVideoXPipeline(vae, transformer, scheduler, pab_config=pab_config)
+                pipe = CogVideoXPipeline(vae, transformer, scheduler, pab_config=pab_config)
+                if "cogvideox-2b-img2vid" in model:
+                    pipe.input_with_padding = False
 
         if enable_sequential_cpu_offload:
             pipe.enable_sequential_cpu_offload()
@@ -1901,4 +1909,4 @@ NODE_DISPLAY_NAME_MAPPINGS = {
     "ToraEncodeTrajectory": "Tora Encode Trajectory",
     "ToraEncodeOpticalFlow": "Tora Encode OpticalFlow",
     "DownloadAndLoadToraModel": "(Down)load Tora Model",
-    }
+    }
\ No newline at end of file
diff --git a/pipeline_cogvideox.py b/pipeline_cogvideox.py
index 677402a..d787f33 100644
--- a/pipeline_cogvideox.py
+++ b/pipeline_cogvideox.py
@@ -161,10 +161,11 @@ class CogVideoXPipeline(VideoSysPipeline):
         self.original_mask = original_mask
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
-
         if pab_config is not None:
             set_pab_manager(pab_config)
 
+        self.input_with_padding = True
+
     def prepare_latents(
         self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator,
         timesteps, denoise_strength, num_inference_steps, latents=None, freenoise=True, context_size=None, context_overlap=None
@@ -517,16 +518,18 @@ class CogVideoXPipeline(VideoSysPipeline):
                     logger.info(f"image cond latents shape: {image_cond_latents.shape}")
                 else:
                     logger.info("Only one image conditioning frame received, img2vid")
-                    padding_shape = (
-                        batch_size,
-                        (latents.shape[1] - 1),
-                        self.vae.config.latent_channels,
-                        height // self.vae_scale_factor_spatial,
-                        width // self.vae_scale_factor_spatial,
-                    )
-                    latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
-                    image_cond_latents = torch.cat([image_cond_latents, latent_padding], dim=1)
-
+                    if self.input_with_padding:
+                        padding_shape = (
+                            batch_size,
+                            (latents.shape[1] - 1),
+                            self.vae.config.latent_channels,
+                            height // self.vae_scale_factor_spatial,
+                            width // self.vae_scale_factor_spatial,
+                        )
+                        latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
+                        image_cond_latents = torch.cat([image_cond_latents, latent_padding], dim=1)
+                    else:
+                        image_cond_latents = image_cond_latents.repeat(1, latents.shape[1], 1, 1, 1)
 
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -921,4 +924,4 @@ class CogVideoXPipeline(VideoSysPipeline):
         # Offload all models
         self.maybe_free_model_hooks()
 
-        return latents
+        return latents
\ No newline at end of file