Merge pull request #193 from Nim-Video/main

Add CogVideoX 2B img2vid model (NimVideo/cogvideox-2b-img2vid)
This commit is contained in:
Jukka Seppänen 2024-10-28 22:38:40 +09:00 committed by GitHub
commit 750deb3918
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 28 additions and 17 deletions

View File

@ -269,6 +269,7 @@ class DownloadAndLoadCogVideoModel:
"alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose",
"alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose",
"feizhengcong/CogvideoX-Interpolation",
"NimVideo/cogvideox-2b-img2vid"
],
),
@ -322,9 +323,14 @@ class DownloadAndLoadCogVideoModel:
download_path = base_path
elif "2b" in model:
base_path = os.path.join(download_path, "CogVideo2B")
download_path = base_path
repo_id = model
if 'img2vid' in model:
base_path = os.path.join(download_path, "cogvideox-2b-img2vid")
download_path = base_path
repo_id = model
else:
base_path = os.path.join(download_path, "CogVideo2B")
download_path = base_path
repo_id = model
else:
base_path = os.path.join(download_path, (model.split("/")[-1]))
download_path = base_path
@ -400,6 +406,8 @@ class DownloadAndLoadCogVideoModel:
else:
vae = AutoencoderKLCogVideoX.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device)
pipe = CogVideoXPipeline(vae, transformer, scheduler, pab_config=pab_config)
if "cogvideox-2b-img2vid" in model:
pipe.input_with_padding = False
if enable_sequential_cpu_offload:
pipe.enable_sequential_cpu_offload()

View File

@ -161,10 +161,11 @@ class CogVideoXPipeline(VideoSysPipeline):
self.original_mask = original_mask
self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
if pab_config is not None:
set_pab_manager(pab_config)
self.input_with_padding = True
def prepare_latents(
self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, timesteps, denoise_strength,
num_inference_steps, latents=None, freenoise=True, context_size=None, context_overlap=None
@ -517,16 +518,18 @@ class CogVideoXPipeline(VideoSysPipeline):
logger.info(f"image cond latents shape: {image_cond_latents.shape}")
else:
logger.info("Only one image conditioning frame received, img2vid")
padding_shape = (
batch_size,
(latents.shape[1] - 1),
self.vae.config.latent_channels,
height // self.vae_scale_factor_spatial,
width // self.vae_scale_factor_spatial,
)
latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
image_cond_latents = torch.cat([image_cond_latents, latent_padding], dim=1)
if self.input_with_padding:
padding_shape = (
batch_size,
(latents.shape[1] - 1),
self.vae.config.latent_channels,
height // self.vae_scale_factor_spatial,
width // self.vae_scale_factor_spatial,
)
latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
image_cond_latents = torch.cat([image_cond_latents, latent_padding], dim=1)
else:
image_cond_latents = image_cond_latents.repeat(1, latents.shape[1], 1, 1, 1)
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)