mirror of https://git.datalinker.icu/kijai/ComfyUI-CogVideoXWrapper.git

fixes

commit ba2dbfbeb4 (parent dac6a2a3ac)
@@ -258,9 +258,9 @@ class DownloadAndLoadCogVideoModel:

         #fp8
         if fp8_transformer == "enabled" or fp8_transformer == "fastmode":
-            params_to_keep = {"patch_embed", "lora", "pos_embedding", "time_embedding"}
+            params_to_keep = {"patch_embed", "lora", "pos_embedding", "time_embedding", "norm_k", "norm_q", "to_k.bias", "to_q.bias", "to_v.bias"}
             if "1.5" in model:
-                params_to_keep.update({"norm1.linear.weight", "norm_k", "norm_q", "ofs_embedding", "norm_final", "norm_out", "proj_out"})
+                params_to_keep.update({"norm1.linear.weight", "ofs_embedding", "norm_final", "norm_out", "proj_out"})
             for name, param in pipe.transformer.named_parameters():
                 if not any(keyword in name for keyword in params_to_keep):
                     param.data = param.data.to(torch.float8_e4m3fn)
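The change keeps the attention norms and q/k/v biases in full precision for every model version, not just 1.5. For context, a minimal standalone sketch of this selective fp8 cast (the keep-list loop is verbatim from the diff; the helper name, toy module, and keep-set are illustrative assumptions, not the wrapper's actual model):

import torch
import torch.nn as nn

def cast_to_fp8_except(model: nn.Module, params_to_keep: set) -> None:
    # Cast parameter storage to fp8, skipping any parameter whose name
    # contains a keep-list keyword (embeddings, norms, biases stay put).
    for name, param in model.named_parameters():
        if not any(keyword in name for keyword in params_to_keep):
            param.data = param.data.to(torch.float8_e4m3fn)

# Toy usage: keep the LayerNorm parameters ("1.weight"/"1.bias") in full precision.
model = nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8))
cast_to_fp8_except(model, {"1.weight", "1.bias"})
print({n: p.dtype for n, p in model.named_parameters()})
# {'0.weight': torch.float8_e4m3fn, '0.bias': torch.float8_e4m3fn,
#  '1.weight': torch.float32, '1.bias': torch.float32}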
nodes.py (16 changed lines)
@@ -854,14 +854,14 @@ class CogVideoSampler:
             num_frames == 49 or
             context_options is not None
         ), "1.0 I2V model can only do 49 frames"
-        if image_cond_latents is not None:
-            assert supports_image_conds, "Image condition latents only supported for I2V and Interpolation models"
-            if "I2V" in model_name:
-                assert image_cond_latents["samples"].shape[1] == 1, "I2V model only supports single image condition latent"
-            elif "interpolation" in model_name.lower():
-                assert image_cond_latents["samples"].shape[1] == 2, "Interpolation model needs two image condition latents"
-            else:
-                assert not supports_image_conds, "Image condition latents required for I2V models"
+        # if image_cond_latents is not None:
+        #     assert supports_image_conds, "Image condition latents only supported for I2V and Interpolation models"
+        #     if "I2V" in model_name:
+        #         assert image_cond_latents["samples"].shape[1] == 1, "I2V model only supports single image condition latent"
+        #     elif "interpolation" in model_name.lower():
+        #         assert image_cond_latents["samples"].shape[1] == 2, "Interpolation model needs two image condition latents"
+        #     else:
+        #         assert not supports_image_conds, "Image condition latents required for I2V models"

         device = mm.get_torch_device()
         offload_device = mm.unet_offload_device()
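For reference, a compact sketch of what the now-disabled checks enforced, assuming image_cond_latents is a dict whose "samples" tensor is laid out (batch, frames, channels, height, width); the helper name is hypothetical:

import torch

def validate_image_conds(image_cond_latents, model_name: str) -> None:
    # Mirrors the commented-out asserts: I2V conditions on one frame,
    # interpolation on two (first and last).
    if image_cond_latents is None:
        return
    num_cond_frames = image_cond_latents["samples"].shape[1]
    if "I2V" in model_name:
        assert num_cond_frames == 1, "I2V model only supports single image condition latent"
    elif "interpolation" in model_name.lower():
        assert num_cond_frames == 2, "Interpolation model needs two image condition latents"

# Passes: two conditioning frames for an interpolation checkpoint.
validate_image_conds({"samples": torch.zeros(1, 2, 16, 60, 90)}, "CogVideoX interpolation")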
@@ -519,10 +519,9 @@ class CogVideoXPipeline(VideoSysPipeline, CogVideoXLoraLoaderMixin):
                 )
                 latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
                 image_cond_latents = torch.cat([image_cond_latents[:, 0, :, :, :].unsqueeze(1), latent_padding, image_cond_latents[:, -1, :, :, :].unsqueeze(1)], dim=1)
-                # Select the first frame along the second dimension
                 if self.transformer.config.patch_size_t is not None:
-                    first_frame = image_cond_latents[:, : image_latents.size(1) % self.transformer.config.patch_size_t, ...]
-                    image_cond_latents = torch.cat([first_frame, image_latents], dim=1)
+                    first_frame = image_cond_latents[:, : image_cond_latents.size(1) % self.transformer.config.patch_size_t, ...]
+                    image_cond_latents = torch.cat([first_frame, image_cond_latents], dim=1)

                 logger.info(f"image cond latents shape: {image_cond_latents.shape}")
             else:
@@ -537,6 +536,10 @@ class CogVideoXPipeline(VideoSysPipeline, CogVideoXLoraLoaderMixin):
                 )
                 latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
                 image_cond_latents = torch.cat([image_cond_latents, latent_padding], dim=1)
+                # Select the first frame along the second dimension
+                if self.transformer.config.patch_size_t is not None:
+                    first_frame = image_cond_latents[:, : image_cond_latents.size(1) % self.transformer.config.patch_size_t, ...]
+                    image_cond_latents = torch.cat([first_frame, image_cond_latents], dim=1)
             else:
                 image_cond_latents = image_cond_latents.repeat(1, latents.shape[1], 1, 1, 1)
             # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
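Both pipeline hunks apply the same pattern: besides fixing the image_latents/image_cond_latents mix-up, they prepend the first (num_frames % patch_size_t) latent frames so the temporal length lines up with the transformer's temporal patch size. A standalone sketch of that arithmetic, with tensor dimensions invented for illustration:

import torch

def pad_to_temporal_patch(image_cond_latents: torch.Tensor, patch_size_t: int) -> torch.Tensor:
    # Prepend the first (num_frames % patch_size_t) frames; with
    # patch_size_t == 2 this rounds an odd frame count up to an even one.
    first_frame = image_cond_latents[:, : image_cond_latents.size(1) % patch_size_t, ...]
    return torch.cat([first_frame, image_cond_latents], dim=1)

# 13 latent frames, patch_size_t = 2 -> one frame prepended -> 14 frames.
latents = torch.zeros(1, 13, 16, 60, 90)
print(pad_to_temporal_patch(latents, 2).shape)  # torch.Size([1, 14, 16, 60, 90])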