diff --git a/model_loading.py b/model_loading.py
index 08ddcee..959c8ff 100644
--- a/model_loading.py
+++ b/model_loading.py
@@ -258,9 +258,9 @@ class DownloadAndLoadCogVideoModel:
 
         #fp8
         if fp8_transformer == "enabled" or fp8_transformer == "fastmode":
-            params_to_keep = {"patch_embed", "lora", "pos_embedding", "time_embedding"}
+            params_to_keep = {"patch_embed", "lora", "pos_embedding", "time_embedding", "norm_k", "norm_q", "to_k.bias", "to_q.bias", "to_v.bias"}
             if "1.5" in model:
-                params_to_keep.update({"norm1.linear.weight", "norm_k", "norm_q","ofs_embedding", "norm_final", "norm_out", "proj_out"})
+                params_to_keep.update({"norm1.linear.weight", "ofs_embedding", "norm_final", "norm_out", "proj_out"})
             for name, param in pipe.transformer.named_parameters():
                 if not any(keyword in name for keyword in params_to_keep):
                     param.data = param.data.to(torch.float8_e4m3fn)
diff --git a/nodes.py b/nodes.py
index 29ffe2c..aa8f6bf 100644
--- a/nodes.py
+++ b/nodes.py
@@ -854,14 +854,14 @@ class CogVideoSampler:
                 num_frames == 49 or context_options is not None
             ), "1.0 I2V model can only do 49 frames"
 
-        if image_cond_latents is not None:
-            assert supports_image_conds, "Image condition latents only supported for I2V and Interpolation models"
-            if "I2V" in model_name:
-                assert image_cond_latents["samples"].shape[1] == 1, "I2V model only supports single image condition latent"
-            elif "interpolation" in model_name.lower():
-                assert image_cond_latents["samples"].shape[1] == 2, "Interpolation model needs two image condition latents"
-        else:
-            assert not supports_image_conds, "Image condition latents required for I2V models"
+        # if image_cond_latents is not None:
+        #     assert supports_image_conds, "Image condition latents only supported for I2V and Interpolation models"
+        #     if "I2V" in model_name:
+        #         assert image_cond_latents["samples"].shape[1] == 1, "I2V model only supports single image condition latent"
+        #     elif "interpolation" in model_name.lower():
+        #         assert image_cond_latents["samples"].shape[1] == 2, "Interpolation model needs two image condition latents"
+        # else:
+        #     assert not supports_image_conds, "Image condition latents required for I2V models"
 
         device = mm.get_torch_device()
         offload_device = mm.unet_offload_device()
diff --git a/pipeline_cogvideox.py b/pipeline_cogvideox.py
index 694a85e..13c960e 100644
--- a/pipeline_cogvideox.py
+++ b/pipeline_cogvideox.py
@@ -519,10 +519,9 @@ class CogVideoXPipeline(VideoSysPipeline, CogVideoXLoraLoaderMixin):
                 )
                 latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
                 image_cond_latents = torch.cat([image_cond_latents[:, 0, :, :, :].unsqueeze(1), latent_padding, image_cond_latents[:, -1, :, :, :].unsqueeze(1)], dim=1)
-                # Select the first frame along the second dimension
                 if self.transformer.config.patch_size_t is not None:
-                    first_frame = image_cond_latents[:, : image_latents.size(1) % self.transformer.config.patch_size_t, ...]
-                    image_cond_latents = torch.cat([first_frame, image_latents], dim=1)
+                    first_frame = image_cond_latents[:, : image_cond_latents.size(1) % self.transformer.config.patch_size_t, ...]
+                    image_cond_latents = torch.cat([first_frame, image_cond_latents], dim=1)
 
                 logger.info(f"image cond latents shape: {image_cond_latents.shape}")
             else:
@@ -537,6 +536,10 @@ class CogVideoXPipeline(VideoSysPipeline, CogVideoXLoraLoaderMixin):
                 )
                 latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
                 image_cond_latents = torch.cat([image_cond_latents, latent_padding], dim=1)
+                # Select the first frame along the second dimension
+                if self.transformer.config.patch_size_t is not None:
+                    first_frame = image_cond_latents[:, : image_cond_latents.size(1) % self.transformer.config.patch_size_t, ...]
+                    image_cond_latents = torch.cat([first_frame, image_cond_latents], dim=1)
             else:
                 image_cond_latents = image_cond_latents.repeat(1, latents.shape[1], 1, 1, 1)
        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
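
Note on the model_loading.py hunk: the keep-list now shields norm scales and the attention q/k/v biases from the fp8 downcast for every model variant, not just 1.5. A minimal, self-contained sketch of this selective-cast pattern follows; the toy module and keep-list keywords are illustrative, not the repo's actual transformer.

    import torch
    import torch.nn as nn

    def cast_to_fp8_except(model: nn.Module, params_to_keep: set) -> None:
        # Downcast every parameter to float8_e4m3fn unless its name contains
        # one of the keep-list keywords (substring match, as in the diff).
        for name, param in model.named_parameters():
            if not any(keyword in name for keyword in params_to_keep):
                param.data = param.data.to(torch.float8_e4m3fn)

    # Toy usage: keep the LayerNorm parameters ("1.weight"/"1.bias") in fp32.
    toy = nn.Sequential(nn.Linear(4, 4), nn.LayerNorm(4))
    cast_to_fp8_except(toy, {"1.weight", "1.bias"})
    print({n: p.dtype for n, p in toy.named_parameters()})
    # {'0.weight': torch.float8_e4m3fn, '0.bias': torch.float8_e4m3fn,
    #  '1.weight': torch.float32, '1.bias': torch.float32}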
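Note on the pipeline_cogvideox.py hunks: the first hunk fixes the indexing to use the already-padded image_cond_latents instead of the stale image_latents name, and the second hunk applies the same temporal alignment step to the non-interpolation branch. A standalone sketch of that alignment follows, under the assumption of patch_size_t == 2 and an odd latent frame count (as in CogVideoX 1.5); tensor shapes are illustrative.

    import torch

    def pad_cond_to_patch_size_t(image_cond_latents: torch.Tensor, patch_size_t: int) -> torch.Tensor:
        # image_cond_latents: [batch, frames, channels, height, width].
        # Repeat the first (frames % patch_size_t) frames at the front; with
        # patch_size_t == 2 and an odd frame count this rounds 13 -> 14, so
        # the temporal length divides evenly into time patches.
        first_frame = image_cond_latents[:, : image_cond_latents.size(1) % patch_size_t, ...]
        return torch.cat([first_frame, image_cond_latents], dim=1)

    cond = torch.zeros(1, 13, 16, 60, 90)           # 13 latent frames, hypothetical shape
    print(pad_cond_to_patch_size_t(cond, 2).shape)  # torch.Size([1, 14, 16, 60, 90])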