diff --git a/pipeline_cogvideox.py b/pipeline_cogvideox.py index 2abd11f..925f6a2 100644 --- a/pipeline_cogvideox.py +++ b/pipeline_cogvideox.py @@ -542,6 +542,8 @@ class CogVideoXPipeline(VideoSysPipeline): use_temporal_tiling = True print("Temporal tiling enabled") elif context_schedule is not None: + if image_cond_latents is not None: + raise NotImplementedError("Context schedule not currently supported with image conditioning") print(f"Context schedule enabled: {context_frames} frames, {context_stride} stride, {context_overlap} overlap") use_temporal_tiling = False use_context_schedule = True @@ -684,7 +686,6 @@ class CogVideoXPipeline(VideoSysPipeline): for c in context_queue: partial_latent_model_input = latent_model_input[:, c, :, :, :] - # predict noise model_output noise_pred[:, c, :, :, :] += self.transformer( hidden_states=partial_latent_model_input, @@ -729,6 +730,7 @@ class CogVideoXPipeline(VideoSysPipeline): if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() + comfy_pbar.update(1) else: latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents @@ -740,7 +742,6 @@ class CogVideoXPipeline(VideoSysPipeline): # broadcast to batch dimension in a way that's compatible with ONNX/Core ML timestep = t.expand(latent_model_input.shape[0]) - # predict noise model_output noise_pred = self.transformer( hidden_states=latent_model_input,