From 573150de283d727ab7b904ddbc774cbb9e186db1 Mon Sep 17 00:00:00 2001
From: kijai <40791699+kijai@users.noreply.github.com>
Date: Wed, 20 Nov 2024 16:41:34 +0200
Subject: [PATCH] Fix Tora fuser dtype when autocast is disabled

---
 pipeline_cogvideox.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pipeline_cogvideox.py b/pipeline_cogvideox.py
index c7e4545..59269b2 100644
--- a/pipeline_cogvideox.py
+++ b/pipeline_cogvideox.py
@@ -571,7 +571,7 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
             #    raise ValueError(f"Tora trajectory length {trajectory_length} does not match inpaint_latents count {latents.shape[2]}")
             for module in self.transformer.fuser_list:
                 for param in module.parameters():
-                    param.data = param.data.to(device)
+                    param.data = param.data.to(self.vae_dtype).to(device)
 
         logger.info(f"Sampling {num_frames} frames in {latent_frames} latent frames at {width}x{height} with {num_inference_steps} inference steps")
 
@@ -733,8 +733,6 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
                     # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                     timestep = t.expand(latent_model_input.shape[0])
 
-                    
-
                     if controlnet is not None:
                         controlnet_states = None
                         if (control_start <= current_step_percentage <= control_end):
@@ -752,7 +750,6 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
                             else:
                                 controlnet_states = controlnet_states.to(dtype=self.vae_dtype)
 
-
                     # predict noise model_output
                     noise_pred = self.transformer(
                         hidden_states=latent_model_input,