diff --git a/pipeline_cogvideox.py b/pipeline_cogvideox.py
index a713fbf..09eeeec 100644
--- a/pipeline_cogvideox.py
+++ b/pipeline_cogvideox.py
@@ -172,7 +172,7 @@ class CogVideoXPipeline(DiffusionPipeline):
             )
         noise = randn_tensor(shape, generator=generator, device=device, dtype=self.vae.dtype)
         if latents is None:
-            latents = noise       
+            latents = noise
         else:
             latents = latents.to(device)
             timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, denoise_strength, device)
@@ -538,7 +538,7 @@ class CogVideoXPipeline(DiffusionPipeline):
                             noise_pred = noise_pred_uncond + self._guidance_scale * (noise_pred_text - noise_pred_uncond)
 
                         # compute the previous noisy sample x_t -> x_t-1
-                        latents_tile = self.scheduler.step(noise_pred, t, latents_tile.to(self.vae.dtype), **extra_step_kwargs, return_dict=False)[0]            
+                        latents_tile = self.scheduler.step(noise_pred, t, latents_tile.to(self.vae.dtype), **extra_step_kwargs, return_dict=False)[0]
                         latents_all_list.append(latents_tile)
 
                     # ==========================================
@@ -617,6 +617,7 @@ class CogVideoXPipeline(DiffusionPipeline):
                             **extra_step_kwargs,
                             return_dict=False,
                         )
+                    latents = latents.to(prompt_embeds.dtype)
                     # start diff diff
                     if i < len(timesteps) - 1 and self.original_mask is not None:
                         noise_timestep = timesteps[i + 1]