Mirror of https://git.datalinker.icu/kijai/ComfyUI-CogVideoXWrapper.git
Commit ac5daa7148 ("fixes")
Parent 032a849bc6
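In short, this change makes the controlnet optional in the denoising loop. As the hunks below show, the pipeline now initializes controlnet_states and control_weights to None when no controlnet is supplied, builds the context queues and runs the per-window controlnet forward pass only under an "if controlnet is not None:" guard (with a transformer-only branch otherwise), and applies the same guard to the non-context-windowed path.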
@@ -587,6 +587,9 @@ class CogVideoXPipeline(VideoSysPipeline):
             print("Controlnet enabled with weights: ", control_weights)
             control_start = controlnet["control_start"]
             control_end = controlnet["control_end"]
+        else:
+            controlnet_states = None
+            control_weights = None

         # 10. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
@@ -702,19 +705,6 @@ class CogVideoXPipeline(VideoSysPipeline):

                 current_step_percentage = i / num_inference_steps

-                context_queue = list(context(
-                    i, num_inference_steps, latents.shape[1], context_frames, context_stride, context_overlap,
-                ))
-                # controlnet frames are not temporally compressed, so try to match the context frames that are
-                control_context_queue = list(context(
-                    i,
-                    num_inference_steps,
-                    control_frames.shape[1],
-                    context_frames * self.vae_scale_factor_temporal,
-                    context_stride * self.vae_scale_factor_temporal,
-                    context_overlap * self.vae_scale_factor_temporal,
-                ))
-
                 # use same rotary embeddings for all context windows
                 image_rotary_emb = (
                     self._prepare_rotary_positional_embeddings(height, width, context_frames, device)
@@ -722,40 +712,70 @@ class CogVideoXPipeline(VideoSysPipeline):
                     else None
                 )

-                for c, control_c in zip(context_queue, control_context_queue):
-                    partial_latent_model_input = latent_model_input[:, c, :, :, :]
-                    partial_control_frames = control_frames[:, control_c, :, :, :]
-
-                    controlnet_states = None
-                    if (control_start <= current_step_percentage <= control_end):
-                        # extract controlnet hidden state
-                        controlnet_states = self.controlnet(
-                            hidden_states=partial_latent_model_input,
-                            encoder_hidden_states=prompt_embeds,
-                            image_rotary_emb=image_rotary_emb,
-                            controlnet_states=partial_control_frames,
-                            timestep=timestep,
-                            return_dict=False,
-                        )[0]
-                        if isinstance(controlnet_states, (tuple, list)):
-                            controlnet_states = [x.to(dtype=self.controlnet.dtype) for x in controlnet_states]
-                        else:
-                            controlnet_states = controlnet_states.to(dtype=self.controlnet.dtype)
-
-                    # predict noise model_output
-                    noise_pred[:, c, :, :, :] += self.transformer(
-                        hidden_states=partial_latent_model_input,
-                        encoder_hidden_states=prompt_embeds,
-                        timestep=timestep,
-                        image_rotary_emb=image_rotary_emb,
-                        return_dict=False,
-                        controlnet_states=controlnet_states,
-                        controlnet_weights=control_weights,
-                    )[0]
-
-                    counter[:, c, :, :, :] += 1
-                    noise_pred = noise_pred.float()
+                context_queue = list(context(
+                    i, num_inference_steps, latents.shape[1], context_frames, context_stride, context_overlap,
+                ))
+
+                if controlnet is not None:
+                    # controlnet frames are not temporally compressed, so try to match the context frames that are
+                    control_context_queue = list(context(
+                        i,
+                        num_inference_steps,
+                        control_frames.shape[1],
+                        context_frames * self.vae_scale_factor_temporal,
+                        context_stride * self.vae_scale_factor_temporal,
+                        context_overlap * self.vae_scale_factor_temporal,
+                    ))
+
+                    for c, control_c in zip(context_queue, control_context_queue):
+                        partial_latent_model_input = latent_model_input[:, c, :, :, :]
+                        partial_control_frames = control_frames[:, control_c, :, :, :]
+
+                        controlnet_states = None
+
+                        if (control_start <= current_step_percentage <= control_end):
+                            # extract controlnet hidden state
+                            controlnet_states = self.controlnet(
+                                hidden_states=partial_latent_model_input,
+                                encoder_hidden_states=prompt_embeds,
+                                image_rotary_emb=image_rotary_emb,
+                                controlnet_states=partial_control_frames,
+                                timestep=timestep,
+                                return_dict=False,
+                            )[0]
+                            if isinstance(controlnet_states, (tuple, list)):
+                                controlnet_states = [x.to(dtype=self.controlnet.dtype) for x in controlnet_states]
+                            else:
+                                controlnet_states = controlnet_states.to(dtype=self.controlnet.dtype)
+
+                        # predict noise model_output
+                        noise_pred[:, c, :, :, :] += self.transformer(
+                            hidden_states=partial_latent_model_input,
+                            encoder_hidden_states=prompt_embeds,
+                            timestep=timestep,
+                            image_rotary_emb=image_rotary_emb,
+                            return_dict=False,
+                            controlnet_states=controlnet_states,
+                            controlnet_weights=control_weights,
+                        )[0]
+
+                        counter[:, c, :, :, :] += 1
+                        noise_pred = noise_pred.float()
+                else:
+                    for c in context_queue:
+                        partial_latent_model_input = latent_model_input[:, c, :, :, :]
+
+                        # predict noise model_output
+                        noise_pred[:, c, :, :, :] += self.transformer(
+                            hidden_states=partial_latent_model_input,
+                            encoder_hidden_states=prompt_embeds,
+                            timestep=timestep,
+                            image_rotary_emb=image_rotary_emb,
+                            return_dict=False
+                        )[0]
+
+                        counter[:, c, :, :, :] += 1
+                        noise_pred = noise_pred.float()

                 noise_pred /= counter
                 if do_classifier_free_guidance:
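Aside: the control_context_queue arguments above multiply the window parameters by self.vae_scale_factor_temporal because, per the in-code comment, controlnet frames are not temporally compressed while the latents are. A minimal standalone sketch of that index mapping, assuming the usual CogVideoX VAE temporal compression of 4 (illustrative values, not the wrapper's context() helper):

# Standalone sketch: map a latent-frame context window to the matching
# span of uncompressed controlnet frames. The compression factor of 4 is
# an assumption (the common CogVideoX VAE value); window sizes are made up.
vae_scale_factor_temporal = 4
context_frames = 12                      # window length in latent frames
window_start = 0                         # first latent frame of this window

latent_window = list(range(window_start, window_start + context_frames))
control_window = list(range(window_start * vae_scale_factor_temporal,
                            (window_start + context_frames) * vae_scale_factor_temporal))

# latents[:, latent_window] lines up with control_frames[:, control_window]
print(len(latent_window), len(control_window))  # 12 48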
@@ -794,23 +814,23 @@ class CogVideoXPipeline(VideoSysPipeline):

                 current_step_percentage = i / num_inference_steps

-                controlnet_states = None
-                if (control_start <= current_step_percentage <= control_end):
-                    # extract controlnet hidden state
-                    controlnet_states = self.controlnet(
-                        hidden_states=latent_model_input,
-                        encoder_hidden_states=prompt_embeds,
-                        image_rotary_emb=image_rotary_emb,
-                        controlnet_states=control_frames,
-                        timestep=timestep,
-                        return_dict=False,
-                    )[0]
-                    if isinstance(controlnet_states, (tuple, list)):
-                        controlnet_states = [x.to(dtype=self.vae.dtype) for x in controlnet_states]
-                    else:
-                        controlnet_states = controlnet_states.to(dtype=self.vae.dtype)
+                if controlnet is not None:
+                    controlnet_states = None
+                    if (control_start <= current_step_percentage <= control_end):
+                        # extract controlnet hidden state
+                        controlnet_states = self.controlnet(
+                            hidden_states=latent_model_input,
+                            encoder_hidden_states=prompt_embeds,
+                            image_rotary_emb=image_rotary_emb,
+                            controlnet_states=control_frames,
+                            timestep=timestep,
+                            return_dict=False,
+                        )[0]
+                        if isinstance(controlnet_states, (tuple, list)):
+                            controlnet_states = [x.to(dtype=self.vae.dtype) for x in controlnet_states]
+                        else:
+                            controlnet_states = controlnet_states.to(dtype=self.vae.dtype)

                 # predict noise model_output
                 noise_pred = self.transformer(
                     hidden_states=latent_model_input,